#! /usr/bin/env python3
"""Convert a plain-text dump of the French Labour Code into a flat XML file.

The input is read line by line.  Heading lines (partie, livre, titre,
chapitre, section, article) move a cursor through the hierarchy; every other
non-empty line is appended to the text of the article the cursor points at.
The result is written as one ``<entry>`` per article, each carrying its
position in the hierarchy as ``<metadata>`` and its text as ``<page_content>``.
"""

import re
import xml
import xml.etree.ElementTree as ET

try:
    import yaml
except ImportError:
    # PyYAML is only needed by the optional (disabled) YAML dump in main();
    # the XML export must not fail just because it is not installed.
    yaml = None

# Input text file and generated XML file.
SOURCE_PATH = "./sources/code-travail-2022.txt"
XML_PATH = "./sources/code-travail.xml"

# Heading levels from outermost to innermost.
_LEVELS = ("partie", "livre", "titre", "chapitre", "section", "article")

# e.g. "Première partie : Les relations individuelles de travail"
re_partie = re.compile(r"^[^ ]+ partie :")
# e.g. "Livre Ier : Dispositions préliminaires"
re_livre = re.compile(r"^Livre")
# e.g. "Titre Ier : Champ d'application et calcul des seuils d'effectifs"
re_titre = re.compile(r"^Titre")
# e.g. "Chapitre unique."
re_chapitre = re.compile(r"^Chapitre")
# e.g. "Section 3 : Organismes consultatifs"
re_section = re.compile(r"^Section ")
# e.g. "Article L1111-1"
re_article = re.compile(r"^Article")


class Doc:
    """Parsing cursor plus the nested dict of article texts collected so far.

    ``content`` is nested as
    ``content[partie][livre][titre][chapitre][section][article] -> text``.
    The remaining string attributes hold the heading currently in effect at
    each level, and ``text`` is the text accumulated for the current article.
    """

    def __init__(self) -> None:
        # Created per instance: as class attributes these mutable containers
        # would be shared by every Doc object.
        self.content: dict = {}
        self.partie: str = ""
        self.livre: str = ""
        self.titre: str = ""
        self.chapitre: str = ""
        self.section: str = ""
        self.article: str = ""
        self.text: str = ""
        self.splits: list = []

    def _enter(self, level: str, heading: str) -> None:
        """Record *heading* at *level* and clear every deeper level and the text."""
        setattr(self, level, heading)
        for deeper in _LEVELS[_LEVELS.index(level) + 1:]:
            setattr(self, deeper, "")
        self.text = ""

    def set_partie(self, arg: str):
        """Start a new partie; resets livre, titre, chapitre, section, article, text."""
        self._enter("partie", arg)

    def set_livre(self, arg: str):
        """Start a new livre; resets titre, chapitre, section, article, text."""
        self._enter("livre", arg)

    def set_titre(self, arg: str):
        """Start a new titre; resets chapitre, section, article, text."""
        self._enter("titre", arg)

    def set_chapitre(self, arg: str):
        """Start a new chapitre; resets section, article, text."""
        self._enter("chapitre", arg)

    def set_section(self, arg: str):
        """Start a new section; resets article, text."""
        self._enter("section", arg)

    def set_article(self, arg: str):
        """Start a new article; resets the accumulated text."""
        self._enter("article", arg)

    def set_text(self, arg: str):
        """Append *arg* to the current article's text and store it in ``content``.

        Each appended line is followed by a single space.  Missing
        intermediate levels of ``content`` are created on demand.
        """
        # Some parts of the code have no explicit chapter or section heading;
        # give them a placeholder so the hierarchy always has the same depth.
        if not self.chapitre:
            self.chapitre = "Chapitre unique"
        if not self.section:
            self.section = "Section unique"

        self.text += arg + " "

        node = self.content
        for heading in (self.partie, self.livre, self.titre,
                        self.chapitre, self.section):
            node = node.setdefault(heading, {})
        node[self.article] = self.text


def parse_line(line: str, doc: Doc) -> None:
    """Classify *line* and update *doc* accordingly.

    The first heading pattern that matches (tested from partie down to
    article) decides which setter is called; a line matching none of them is
    treated as article text.
    """
    handlers = (
        (re_partie, doc.set_partie),
        (re_livre, doc.set_livre),
        (re_titre, doc.set_titre),
        (re_chapitre, doc.set_chapitre),
        (re_section, doc.set_section),
        (re_article, doc.set_article),
    )
    for pattern, handler in handlers:
        if pattern.match(line):
            handler(line)
            return
    doc.set_text(line)


def _iter_articles(node: dict, path: tuple = ()):
    """Yield ``(path, text)`` for every article stored under *node*.

    *path* is the tuple of headings from partie down to article, in the same
    order as ``_LEVELS``.
    """
    for heading, child in node.items():
        here = path + (heading,)
        if len(here) == len(_LEVELS):
            yield here, child
        else:
            yield from _iter_articles(child, here)


def build_xml(content: dict) -> ET.Element:
    """Return a ``<data>`` element holding one ``<entry>`` per article.

    Each entry has a ``<metadata>`` child with one sub-element per hierarchy
    level and a ``<page_content>`` child with the article text.
    """
    root = ET.Element("data")
    for path, text in _iter_articles(content):
        entry = ET.SubElement(root, "entry")
        metadata = ET.SubElement(entry, "metadata")
        for level, heading in zip(_LEVELS, path):
            ET.SubElement(metadata, level).text = heading
        ET.SubElement(entry, "page_content").text = text
    return root


def main() -> None:
    """Read the source text, parse it and write the XML file."""
    doc = Doc()
    # Explicit encoding: the text is accented French and must not depend on
    # the platform's locale.  NOTE(review): assumes the dump is UTF-8.
    with open(SOURCE_PATH, encoding="utf-8") as fh:
        for raw_line in fh:
            line = raw_line.rstrip()
            if line:
                parse_line(line, doc)

    # Optional YAML dump of the nested structure (disabled; requires PyYAML):
    # with open("./sources/code-travail.yaml", "w") as fh:
    #     yaml.safe_dump(doc.content, fh)

    tree = ET.ElementTree(build_xml(doc.content))
    # UTF-8 keeps accented characters readable instead of the numeric
    # character references the default us-ascii encoding would emit.
    tree.write(XML_PATH, encoding="utf-8", xml_declaration=True)


if __name__ == "__main__":
    main()