# code-travail/code-travail-txt2xml.py

#! /usr/bin/env python3
import re
import xml

import yaml

# Patterns recognising the heading lines of the labour code text.
# Each one is anchored at the start of the line; the quoted samples are
# headings as they appear in the source text.

# Part heading, e.g. "Première partie : Les relations individuelles de travail"
re_partie = re.compile(r"^[^ ]+ partie :")

# Book heading, e.g. "Livre Ier : Dispositions préliminaires"
re_livre = re.compile(r"^Livre")

# Title heading, e.g. "Titre Ier : Champ d'application et calcul des seuils d'effectifs"
re_titre = re.compile(r"^Titre")

# Chapter heading, e.g. "Chapitre unique."
re_chapitre = re.compile(r"^Chapitre")

# Section heading, e.g. "Section 3 : Organismes consultatifs"
# (the pattern includes the trailing space after "Section")
re_section = re.compile(r"^Section ")

# Article heading, e.g. "Article L1111-1"
re_article = re.compile(r"^Article")


class Doc:
    """Track the current position in the document hierarchy and collect text.

    The hierarchy levels are, from outermost to innermost: partie, livre,
    titre, chapitre, section, article. ``content`` is a nested dict keyed by
    the heading of each level in that order; the innermost value is the
    accumulated text of the article.
    """

    # Hierarchy levels from outermost to innermost, followed by the text
    # buffer. Setting one level clears every name that comes after it.
    _LEVELS = ("partie", "livre", "titre", "chapitre", "section", "article", "text")

    # Current heading at each level ("" means none has been set).
    partie: str = ""
    livre: str = ""
    titre: str = ""
    chapitre: str = ""
    section: str = ""
    article: str = ""
    # Text accumulated so far for the current article.
    text: str = ""

    def __init__(self) -> None:
        # Created per instance: a class-level ``{}`` / ``[]`` would be one
        # object shared by every Doc.
        self.content: dict = {}
        self.splits: list = []

    def _enter(self, level: str, heading: str) -> None:
        """Store ``heading`` at ``level`` and reset every deeper level to ""."""
        setattr(self, level, heading)
        deeper = self._LEVELS[self._LEVELS.index(level) + 1:]
        for name in deeper:
            setattr(self, name, "")

    def set_partie(self, arg: str):
        """Start a new partie; clears livre, titre, chapitre, section, article, text."""
        self._enter("partie", arg)

    def set_livre(self, arg: str):
        """Start a new livre; clears titre, chapitre, section, article, text."""
        self._enter("livre", arg)

    def set_titre(self, arg: str):
        """Start a new titre; clears chapitre, section, article, text."""
        self._enter("titre", arg)

    def set_chapitre(self, arg: str):
        """Start a new chapitre; clears section, article, text."""
        self._enter("chapitre", arg)

    def set_section(self, arg: str):
        """Start a new section; clears article and text."""
        self._enter("section", arg)

    def set_article(self, arg: str):
        """Start a new article; clears the accumulated text."""
        self._enter("article", arg)

    def set_text(self, arg: str):
        """Append ``arg`` to the current article and store it in ``content``.

        ``arg`` is appended followed by a single space. The accumulated text
        is written at ``content[partie][livre][titre][chapitre][section][article]``,
        creating the intermediate dicts as needed.
        """
        # Some books have no explicit chapter (or section): use a placeholder
        # heading so the nesting depth stays the same everywhere.
        if not self.chapitre:
            self.chapitre = "Chapitre unique"
        if not self.section:
            self.section = "Section unique"
        self.text += arg + " "

        # Walk down the hierarchy, creating each missing level on the way.
        node = self.content
        for heading in (self.partie, self.livre, self.titre, self.chapitre, self.section):
            node = node.setdefault(heading, {})
        node[self.article] = self.text


def parse_line(line: str, doc: Doc) -> None:
    """Classify ``line`` and record it in ``doc``.

    The line is tested against the heading patterns from the outermost level
    (partie) to the innermost (article); the first pattern that matches
    decides which ``doc`` setter receives the line. A line matching none of
    them is treated as article text.

    Args:
        line: One line of the source text.
        doc: The document accumulator to update in place.
    """
    # Order matters: only the first matching pattern is applied.
    dispatch = (
        (re_partie, doc.set_partie),
        (re_livre, doc.set_livre),
        (re_titre, doc.set_titre),
        (re_chapitre, doc.set_chapitre),
        (re_section, doc.set_section),
        (re_article, doc.set_article),
    )
    for pattern, handler in dispatch:
        if pattern.match(line):
            handler(line)
            return
    doc.set_text(line)


# Parse the source text line by line, skipping blank lines; each remaining
# line is classified by parse_line and recorded in ``doc``.
# NOTE(review): the encoding is pinned to UTF-8 so the text decodes the same
# way on every platform instead of depending on the locale default — confirm
# that the source file is UTF-8.
doc = Doc()
with open("./sources/code-travail-2022.txt", encoding="utf-8") as fh:
    for raw_line in fh:
        line = raw_line.rstrip()
        if line:
            parse_line(line, doc)

# Disabled alternative output: dump the parsed tree as YAML.
# with open("./sources/code-travail.yaml", "w") as fh:
#     yaml.safe_dump(doc.content, fh)

import xml.etree.ElementTree as ET

# Names of the metadata child elements, in nesting order of ``doc.content``.
_METADATA_KEYS = ("partie", "livre", "titre", "chapitre", "section", "article")


def _iter_articles(node, path=()):
    """Yield ``(headings, text)`` for every article stored under ``node``.

    ``headings`` is the tuple of dict keys leading to the article, one per
    name in ``_METADATA_KEYS``; entries come out in dict insertion order.
    """
    if len(path) == len(_METADATA_KEYS):
        yield path, node
        return
    for heading, child in node.items():
        yield from _iter_articles(child, path + (heading,))


# Create the root element of the XML tree
root = ET.Element('data')

# One <entry> per article: a <metadata> element holding the heading of every
# hierarchy level, and a <page_content> element holding the article text.
for headings, article_text in _iter_articles(doc.content):
    entry = ET.SubElement(root, 'entry')

    metadata = ET.SubElement(entry, 'metadata')
    for key, value in zip(_METADATA_KEYS, headings):
        ET.SubElement(metadata, key).text = value

    page_content = ET.SubElement(entry, 'page_content')
    page_content.text = article_text

# Create an ElementTree object
tree = ET.ElementTree(root)

# Specify the XML file name
xml_filename = './sources/code-travail.xml'

# Save the XML tree to a file
tree.write(xml_filename)