164 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			164 lines
		
	
	
		
			5.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #! /usr/bin/env python3
 | |
| import re
 | |
| import xml
 | |
| 
 | |
| import yaml
 | |
| 
 | |
# Heading classifiers for the source text. Each pattern is anchored at the
# start of a line and recognises the heading that opens one level of the
# document hierarchy. The quoted samples are headings from the source text.

# Part heading, e.g. "Première partie : Les relations individuelles de travail"
re_partie = re.compile(r"^[^ ]+ partie :")

# Book heading, e.g. "Livre Ier : Dispositions préliminaires"
re_livre = re.compile(r"^Livre")

# Title heading, e.g. "Titre Ier : Champ d'application et calcul des seuils d'effectifs"
re_titre = re.compile(r"^Titre")

# Chapter heading, e.g. "Chapitre unique."
re_chapitre = re.compile(r"^Chapitre")

# Section heading, e.g. "Section 3 : Organismes consultatifs"
# The trailing space is part of the pattern, so a word that merely starts
# with "Section" is not treated as a heading.
re_section = re.compile(r"^Section ")

# Article heading, e.g. "Article L1111-1"
re_article = re.compile(r"^Article")
 | |
| 
 | |
| 
 | |
class Doc:
    """Track the current heading at every level and collect article text.

    The ``set_*`` heading methods record the heading for their level and clear
    every level below it. ``set_text`` appends a line to the running article
    text and stores it in ``content``, a nested mapping keyed by
    partie -> livre -> titre -> chapitre -> section -> article.
    """

    # Current heading at each level; an empty string means "none seen yet".
    # Strings are immutable, so class-level defaults are safe here.
    partie: str = ""
    livre: str = ""
    titre: str = ""
    chapitre: str = ""
    section: str = ""
    article: str = ""
    # Text accumulated so far for the current article.
    text: str = ""

    def __init__(self) -> None:
        # Mutable containers must be created per instance: as class attributes
        # they would be one shared object mutated by every Doc.
        self.content: dict = {}
        # NOTE(review): not read or written anywhere in the visible code.
        self.splits: list = []

    def set_partie(self, arg: str) -> None:
        """Start a new part and clear every lower level and the text."""
        self.partie = arg
        self.livre = self.titre = self.chapitre = self.section = self.article = self.text = ""

    def set_livre(self, arg: str) -> None:
        """Start a new book and clear every lower level and the text."""
        self.livre = arg
        self.titre = self.chapitre = self.section = self.article = self.text = ""

    def set_titre(self, arg: str) -> None:
        """Start a new title and clear every lower level and the text."""
        self.titre = arg
        self.chapitre = self.section = self.article = self.text = ""

    def set_chapitre(self, arg: str) -> None:
        """Start a new chapter and clear section, article and text."""
        self.chapitre = arg
        self.section = self.article = self.text = ""

    def set_section(self, arg: str) -> None:
        """Start a new section and clear article and text."""
        self.section = arg
        self.article = self.text = ""

    def set_article(self, arg: str) -> None:
        """Start a new article and clear the accumulated text."""
        self.article = arg
        self.text = ""

    def set_text(self, arg: str) -> None:
        """Append ``arg`` to the current article and store it in ``content``.

        Each appended line is followed by a single space, so the stored text
        always ends with a trailing space.
        """
        # Some parts of the source have no explicit chapter or section
        # heading; file their articles under a placeholder so the nesting
        # depth stays uniform.
        if not self.chapitre:
            self.chapitre = "Chapitre unique"
        if not self.section:
            self.section = "Section unique"
        self.text += arg + " "

        # Walk down the hierarchy, creating each missing level on the way.
        node = self.content
        for heading in (self.partie, self.livre, self.titre, self.chapitre, self.section):
            node = node.setdefault(heading, {})
        node[self.article] = self.text
 | |
| 
 | |
| 
 | |
| 
 | |
def parse_line(line: str, doc: Doc) -> None:
    """Classify ``line`` and forward it to the matching ``doc`` method.

    The heading patterns are tried from the highest level (partie) down to
    the lowest (article); the first one that matches wins. A line that
    matches none of them is treated as article text.

    Args:
        line: One line of the source text. The caller in this file strips
            trailing whitespace and skips empty lines before calling.
        doc: The document accumulator that is updated in place.
    """
    # Order matters: it reproduces the precedence of the heading levels.
    handlers = (
        (re_partie, doc.set_partie),
        (re_livre, doc.set_livre),
        (re_titre, doc.set_titre),
        (re_chapitre, doc.set_chapitre),
        (re_section, doc.set_section),
        (re_article, doc.set_article),
    )
    for pattern, handler in handlers:
        if pattern.match(line):
            handler(line)
            return
    doc.set_text(line)
 | |
| 
 | |
| 
 | |
# Read the source text as a list of lines.
# The encoding is pinned so that accented characters decode the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes the source file is UTF-8 — confirm.
with open("./sources/code-travail-2022.txt", encoding="utf-8") as fh:
    lines = fh.readlines()

doc = Doc()
# Feed every non-empty line to the parser, which classifies it as a heading
# or as article text and updates doc accordingly.
for line in lines:
    line = line.rstrip()
    if not line:
        continue
    parse_line(line, doc)
 | |
| 
 | |
| # with open("./sources/code-travail.yaml", "w") as fh:
 | |
| #     yaml.safe_dump(doc.content, fh)
 | |
| 
 | |
| import xml.etree.ElementTree as ET
 | |
| 
 | |
# Create the root element of the XML tree
root = ET.Element('data')

# Flatten the nested mapping (partie -> livre -> titre -> chapitre -> section
# -> article) into one <entry> element per article.
for partie, partie_value in doc.content.items():
    for livre, livre_value in partie_value.items():
        for titre, titre_value in livre_value.items():
            for chapitre, chapitre_value in titre_value.items():
                for section, section_value in chapitre_value.items():
                    for article, article_value in section_value.items():
                        entry = ET.SubElement(root, 'entry')

                        # <metadata> holds one child element per heading
                        # level, locating the article in the hierarchy.
                        metadata = ET.SubElement(entry, 'metadata')
                        metadata_content = {
                            "partie": partie,
                            "livre": livre,
                            "titre": titre,
                            "chapitre": chapitre,
                            "section": section,
                            "article": article,
                        }
                        for key, value in metadata_content.items():
                            element = ET.SubElement(metadata, key)
                            element.text = value

                        # <page_content> holds the article text itself.
                        page_content = ET.SubElement(entry, 'page_content')
                        page_content.text = article_value

# Create an ElementTree object
tree = ET.ElementTree(root)

# Specify the XML file name
xml_filename = './sources/code-travail.xml'

# Save the XML tree to a file. The default serialisation is kept, so the
# output bytes are the same as before.
tree.write(xml_filename)
 | |
| 
 |