164 lines
5.3 KiB
Python
164 lines
5.3 KiB
Python
#! /usr/bin/env python3
|
|
import re
|
|
import xml
|
|
|
|
import yaml
|
|
|
|
# Heading patterns, one per level of the document hierarchy. Each is anchored
# at the start of the line; the comment above a pattern quotes a sample
# heading of that level.

# e.g. "Première partie : Les relations individuelles de travail"
# (any single space-free word followed by " partie :")
re_partie = re.compile(r"^[^ ]+ partie :")

# e.g. "Livre Ier : Dispositions préliminaires"
re_livre = re.compile(r"^Livre")

# e.g. "Titre Ier : Champ d'application et calcul des seuils d'effectifs"
re_titre = re.compile(r"^Titre")

# e.g. "Chapitre unique."
re_chapitre = re.compile(r"^Chapitre")

# e.g. "Section 3 : Organismes consultatifs"
# (the trailing space in the pattern is required for a match)
re_section = re.compile(r"^Section ")

# e.g. "Article L1111-1"
re_article = re.compile(r"^Article")
|
|
|
|
|
|
class Doc:
    """Accumulate a document's text under a six-level heading hierarchy.

    The current position is held in the attributes ``partie``, ``livre``,
    ``titre``, ``chapitre``, ``section`` and ``article``. Setting one of them
    clears every level below it as well as the pending ``text``.
    ``set_text`` appends to the pending text and stores it in ``content``,
    a nested dict keyed by the six headings in that order.
    """

    # Heading levels from outermost to innermost.
    _LEVELS = ("partie", "livre", "titre", "chapitre", "section", "article")

    def __init__(self) -> None:
        # The containers are created per instance: declared as class
        # attributes they would be one shared object for every Doc.
        self.content: dict = {}
        # Not used by any method of this class; kept for external callers.
        self.splits: list = []
        self.partie: str = ""
        self.livre: str = ""
        self.titre: str = ""
        self.chapitre: str = ""
        self.section: str = ""
        self.article: str = ""
        self.text: str = ""

    def _enter(self, level: str, heading: str) -> None:
        """Store ``heading`` at ``level``; blank all deeper levels and the text."""
        depth = self._LEVELS.index(level)
        setattr(self, level, heading)
        for lower in self._LEVELS[depth + 1:]:
            setattr(self, lower, "")
        self.text = ""

    def set_partie(self, arg: str) -> None:
        """Start a new partie, clearing livre through article and the text."""
        self._enter("partie", arg)

    def set_livre(self, arg: str) -> None:
        """Start a new livre, clearing titre through article and the text."""
        self._enter("livre", arg)

    def set_titre(self, arg: str) -> None:
        """Start a new titre, clearing chapitre through article and the text."""
        self._enter("titre", arg)

    def set_chapitre(self, arg: str) -> None:
        """Start a new chapitre, clearing section, article and the text."""
        self._enter("chapitre", arg)

    def set_section(self, arg: str) -> None:
        """Start a new section, clearing article and the text."""
        self._enter("section", arg)

    def set_article(self, arg: str) -> None:
        """Start a new article, clearing the text."""
        self._enter("article", arg)

    def set_text(self, arg: str) -> None:
        """Append ``arg`` plus a space to the text and store it in ``content``.

        The accumulated text is written at
        ``content[partie][livre][titre][chapitre][section][article]``;
        missing intermediate dicts are created on the way down.
        """
        # Some parts of the document have no explicit chapter or section:
        # file the text under a placeholder heading instead of an empty key.
        if not self.chapitre:
            self.chapitre = "Chapitre unique"
        if not self.section:
            self.section = "Section unique"
        self.text += arg + " "

        node = self.content
        path = (
            self.partie,
            self.livre,
            self.titre,
            self.chapitre,
            self.section,
        )
        for heading in path:
            node = node.setdefault(heading, {})
        node[self.article] = self.text
|
|
|
|
|
|
|
|
def parse_line(line: str, doc: Doc) -> None:
    """Dispatch one line to the ``Doc`` setter for the heading it matches.

    The heading patterns are tried from the outermost level (partie) to the
    innermost (article); the first one that matches decides which setter
    receives the line. A line matching no heading pattern is treated as body
    text and passed to ``doc.set_text``.

    Args:
        line: The line to classify, passed unchanged to the chosen setter.
        doc: The document whose state is updated in place.
    """
    handlers = (
        (re_partie, doc.set_partie),
        (re_livre, doc.set_livre),
        (re_titre, doc.set_titre),
        (re_chapitre, doc.set_chapitre),
        (re_section, doc.set_section),
        (re_article, doc.set_article),
    )
    for pattern, handler in handlers:
        if pattern.match(line):
            handler(line)
            return
    doc.set_text(line)
|
|
|
|
|
|
# Read the source text as a list of lines. The encoding is explicit so that
# decoding does not depend on the platform default.
# NOTE(review): assumes the source file is UTF-8 encoded — confirm.
with open("./sources/code-travail-2022.txt", encoding="utf-8") as fh:
    lines = fh.readlines()

doc = Doc()
# Strip trailing whitespace (including the newline) from each line and feed
# every non-empty one to the parser, which updates ``doc`` in place.
for line in lines:
    line = line.rstrip()
    if not line:
        continue
    parse_line(line, doc)
|
|
|
|
# with open("./sources/code-travail.yaml", "w") as fh:
|
|
# yaml.safe_dump(doc.content, fh)
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
# Create the root element of the XML tree
root = ET.Element('data')

# Walk the nested dictionaries in doc.content down to the article level.
# Every article becomes one <entry> with two children: <metadata>, holding
# the six headings that locate it, and <page_content>, holding its text.
for partie, partie_value in doc.content.items():
    for livre, livre_value in partie_value.items():
        for titre, titre_value in livre_value.items():
            for chapitre, chapitre_value in titre_value.items():
                for section, section_value in chapitre_value.items():
                    for article, article_value in section_value.items():
                        entry = ET.SubElement(root, 'entry')

                        # Add the "metadata" as child elements of "entry",
                        # one element per heading level.
                        metadata = ET.SubElement(entry, 'metadata')
                        metadata_content = {
                            "partie": partie,
                            "livre": livre,
                            "titre": titre,
                            "chapitre": chapitre,
                            "section": section,
                            "article": article
                        }
                        for key, value in metadata_content.items():
                            element = ET.SubElement(metadata, key)
                            element.text = value

                        # Add "page_content" as a child element of "entry"
                        page_content = ET.SubElement(entry, 'page_content')
                        page_content.text = article_value

# Create an ElementTree object
tree = ET.ElementTree(root)

# Specify the XML file name
xml_filename = './sources/code-travail.xml'

# Save the XML tree to that file (the path is taken from xml_filename rather
# than repeated as a second literal).
tree.write(xml_filename)
|
|
|