Add CSV export of mariage table and merged GEDCOM
- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM, TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale qui produisait des artefacts type "Félix") - csv_export/ : 4 fichiers générés (380 892 enregistrements au total) - merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les identifiants INDI/FAM pour éviter les collisions - lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,287 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Export de la table `mariage` depuis basesgen.sql vers 4 fichiers CSV
|
||||
classés par type d'acte :
|
||||
- mariages_religieux.csv (BMS)
|
||||
- contrats_de_mariages.csv (CM + TABLE CM)
|
||||
- tables_de_mariages.csv (TABLE MARIAGES)
|
||||
- etat_civil.csv (ETAT CIVIL)
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
SQL_FILE = "/home/yannick/Téléchargements/basesgen.sql"
|
||||
OUTPUT_DIR = "/home/yannick/BasesCGL/csv_export"
|
||||
|
||||
# Colonnes telles qu'elles existent dans le SQL (26 colonnes)
|
||||
SQL_COLUMNS = [
|
||||
"CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE",
|
||||
"NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX",
|
||||
"DATE_NAISSANCE_EPOUX", "VILLE_EPOUX",
|
||||
"NOM_PERE_EPOUX", "NOM_MERE_EPOUX", "VEUF_EPOUX",
|
||||
"NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE",
|
||||
"DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE",
|
||||
"NOM_PERE_EPOUSE", "NOM_MERE_EPOUSE", "VEUVE_EPOUSE",
|
||||
"OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE",
|
||||
]
|
||||
|
||||
# Colonnes en sortie CSV (30 colonnes) : 4 colonnes dérivées insérées après chaque paire de parents
|
||||
CSV_COLUMNS = [
|
||||
"CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE",
|
||||
"NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX",
|
||||
"DATE_NAISSANCE_EPOUX", "VILLE_EPOUX",
|
||||
"NOM_PERE_EPOUX", "PERE_EPOUX_DECEDE", "NOM_MERE_EPOUX", "MERE_EPOUX_DECEDEE", "VEUF_EPOUX",
|
||||
"NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE",
|
||||
"DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE",
|
||||
"NOM_PERE_EPOUSE", "PERE_EPOUSE_DECEDE", "NOM_MERE_EPOUSE", "MERE_EPOUSE_DECEDEE", "VEUVE_EPOUSE",
|
||||
"OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE",
|
||||
]
|
||||
|
||||
TYPE_ACTE_IDX = 24 # index de TYPE_ACTE dans la ligne SQL parsée (0-based)
|
||||
|
||||
# Indices SQL des noms de parents (pour détecter le '+')
|
||||
IDX_PERE_EPOUX = 10
|
||||
IDX_MERE_EPOUX = 11
|
||||
IDX_PERE_EPOUSE = 19
|
||||
IDX_MERE_EPOUSE = 20
|
||||
|
||||
# Mapping catégorie → nom de fichier
|
||||
OUTPUT_FILES = {
|
||||
"BMS": "mariages_religieux.csv",
|
||||
"CM": "contrats_de_mariages.csv",
|
||||
"TABLES": "tables_de_mariages.csv",
|
||||
"ETAT_CIVIL": "etat_civil.csv",
|
||||
}
|
||||
|
||||
|
||||
def classify(type_acte: str) -> str | None:
|
||||
# Normaliser les espaces multiples pour la classification
|
||||
import re as _re
|
||||
ta = _re.sub(r'\s+', ' ', type_acte.strip()).upper()
|
||||
if ta.startswith("BMS"):
|
||||
return "BMS"
|
||||
if ta.startswith("ETAT CIVIL") or ta.startswith("ETAT CILVIL"):
|
||||
return "ETAT_CIVIL"
|
||||
if ta.startswith("TABLE CM") or ta.startswith("TABLE C"):
|
||||
return "CM" # fusionné avec Contrats de Mariages
|
||||
if ta.startswith("TABLE"):
|
||||
return "TABLES" # TABLE MARIAGES, TABLE PUBLICATIONS DE MARIAGES, etc.
|
||||
if ta.startswith("CM"):
|
||||
return "CM"
|
||||
return None
|
||||
|
||||
|
||||
def parse_mysql_value(src: str, pos: int) -> tuple[str, int]:
|
||||
"""Parse a single MySQL value starting at pos; return (value_str, next_pos)."""
|
||||
# skip whitespace
|
||||
while pos < len(src) and src[pos] in (' ', '\t'):
|
||||
pos += 1
|
||||
|
||||
if pos >= len(src):
|
||||
return ("", pos)
|
||||
|
||||
if src[pos] == "'":
|
||||
# quoted string
|
||||
pos += 1
|
||||
buf = []
|
||||
while pos < len(src):
|
||||
ch = src[pos]
|
||||
if ch == '\\' and pos + 1 < len(src):
|
||||
nxt = src[pos + 1]
|
||||
if nxt == "'":
|
||||
buf.append("'")
|
||||
elif nxt == '\\':
|
||||
buf.append('\\')
|
||||
elif nxt == 'n':
|
||||
buf.append('\n')
|
||||
elif nxt == 'r':
|
||||
buf.append('\r')
|
||||
elif nxt == 't':
|
||||
buf.append('\t')
|
||||
else:
|
||||
buf.append(nxt)
|
||||
pos += 2
|
||||
elif ch == "'":
|
||||
pos += 1
|
||||
break
|
||||
else:
|
||||
buf.append(ch)
|
||||
pos += 1
|
||||
return (''.join(buf), pos)
|
||||
|
||||
if src[pos:pos + 4] == 'NULL':
|
||||
return ('', pos + 4)
|
||||
|
||||
# integer (possibly negative)
|
||||
j = pos
|
||||
if j < len(src) and src[j] == '-':
|
||||
j += 1
|
||||
while j < len(src) and src[j].isdigit():
|
||||
j += 1
|
||||
return (src[pos:j], j)
|
||||
|
||||
|
||||
def parse_row(line: str) -> list[str] | None:
|
||||
"""Parse one INSERT tuple line into a list of string values, or None if not a data row."""
|
||||
line = line.strip()
|
||||
if not line.startswith('('):
|
||||
return None
|
||||
# strip trailing ), or );
|
||||
if line.endswith(');'):
|
||||
inner = line[1:-2]
|
||||
elif line.endswith('),'):
|
||||
inner = line[1:-2]
|
||||
elif line.endswith(')'):
|
||||
inner = line[1:-1]
|
||||
else:
|
||||
return None
|
||||
|
||||
values = []
|
||||
pos = 0
|
||||
while pos < len(inner):
|
||||
# skip whitespace and commas between values
|
||||
while pos < len(inner) and inner[pos] in (' ', '\t'):
|
||||
pos += 1
|
||||
if pos >= len(inner):
|
||||
break
|
||||
if inner[pos] == ',':
|
||||
pos += 1
|
||||
continue
|
||||
val, pos = parse_mysql_value(inner, pos)
|
||||
values.append(val)
|
||||
|
||||
return values if len(values) == len(SQL_COLUMNS) else None
|
||||
|
||||
|
||||
def deceased_flag(name: str) -> str:
|
||||
"""Renvoie '1' si le nom se termine par '+', '' sinon."""
|
||||
return "1" if name.strip().endswith("+") else ""
|
||||
|
||||
|
||||
def enrich_row(sql_row: list[str]) -> list[str]:
|
||||
"""Insère les 4 colonnes dérivées dans la liste SQL (26 → 30 colonnes)."""
|
||||
r = sql_row
|
||||
return [
|
||||
# Époux (indices SQL 0-9 inchangés)
|
||||
r[0], r[1], r[2], r[3],
|
||||
r[4], r[5], r[6], r[7], r[8], r[9],
|
||||
# Parents époux + flags décès
|
||||
r[IDX_PERE_EPOUX], deceased_flag(r[IDX_PERE_EPOUX]),
|
||||
r[IDX_MERE_EPOUX], deceased_flag(r[IDX_MERE_EPOUX]),
|
||||
r[12], # VEUF_EPOUX
|
||||
# Épouse (SQL 13-18)
|
||||
r[13], r[14], r[15], r[16], r[17], r[18],
|
||||
# Parents épouse + flags décès
|
||||
r[IDX_PERE_EPOUSE], deceased_flag(r[IDX_PERE_EPOUSE]),
|
||||
r[IDX_MERE_EPOUSE], deceased_flag(r[IDX_MERE_EPOUSE]),
|
||||
r[21], # VEUVE_EPOUSE
|
||||
# Reste (SQL 22-25)
|
||||
r[22], r[23], r[24], r[25],
|
||||
]
|
||||
|
||||
|
||||
def main():
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
file_size = os.path.getsize(SQL_FILE)
|
||||
|
||||
# Ouvrir les 4 CSV en sortie (UTF-8 avec BOM pour compatibilité Excel)
|
||||
handles = {}
|
||||
writers = {}
|
||||
for key, fname in OUTPUT_FILES.items():
|
||||
path = os.path.join(OUTPUT_DIR, fname)
|
||||
fh = open(path, 'w', encoding='utf-8-sig', newline='')
|
||||
writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
|
||||
writer.writerow(CSV_COLUMNS)
|
||||
handles[key] = fh
|
||||
writers[key] = writer
|
||||
|
||||
counts = {k: 0 for k in OUTPUT_FILES}
|
||||
skipped = 0
|
||||
anomalies = []
|
||||
|
||||
start = time.time()
|
||||
bytes_read = 0
|
||||
in_mariage_insert = False
|
||||
last_pct = -1
|
||||
|
||||
print(f"Lecture de {SQL_FILE} ({file_size / 1e9:.2f} Go)…")
|
||||
|
||||
with open(SQL_FILE, encoding='utf-8', errors='replace') as f:
|
||||
for line in f:
|
||||
bytes_read += len(line.encode('utf-8', errors='replace'))
|
||||
|
||||
# Affichage de la progression tous les 5 %
|
||||
pct = int(bytes_read * 100 / file_size)
|
||||
if pct != last_pct and pct % 5 == 0:
|
||||
elapsed = time.time() - start
|
||||
total_est = int(elapsed / max(bytes_read, 1) * file_size)
|
||||
remaining = max(0, total_est - int(elapsed))
|
||||
print(f" {pct:3d}% ({bytes_read / 1e9:.2f} Go) "
|
||||
f"~{remaining // 60}m{remaining % 60:02d}s restant", flush=True)
|
||||
last_pct = pct
|
||||
|
||||
# Détection des blocs INSERT de la table mariage
|
||||
if 'INSERT INTO `mariage`' in line:
|
||||
in_mariage_insert = True
|
||||
continue
|
||||
|
||||
if not in_mariage_insert:
|
||||
continue
|
||||
|
||||
# Fin du bloc INSERT
|
||||
if line.strip() == '' or line.strip().startswith('--') or line.strip().startswith('/*!'):
|
||||
if not line.strip().startswith('('):
|
||||
in_mariage_insert = False
|
||||
continue
|
||||
|
||||
row = parse_row(line)
|
||||
|
||||
if row is None:
|
||||
if line.strip().startswith('('):
|
||||
skipped += 1
|
||||
anomalies.append(line.strip()[:120])
|
||||
continue
|
||||
|
||||
type_acte = row[TYPE_ACTE_IDX]
|
||||
cat = classify(type_acte)
|
||||
|
||||
if cat is None:
|
||||
skipped += 1
|
||||
anomalies.append(f"TYPE_ACTE inconnu: {repr(type_acte)}")
|
||||
continue
|
||||
|
||||
writers[cat].writerow(enrich_row(row))
|
||||
counts[cat] += 1
|
||||
|
||||
# Fin de bloc (dernière ligne se termine par ';')
|
||||
if line.rstrip().endswith(';'):
|
||||
in_mariage_insert = False
|
||||
|
||||
for fh in handles.values():
|
||||
fh.close()
|
||||
|
||||
elapsed = int(time.time() - start)
|
||||
total = sum(counts.values())
|
||||
|
||||
print(f"\n{'='*55}")
|
||||
print(f"Export terminé en {elapsed // 60}m{elapsed % 60:02d}s")
|
||||
print(f"{'='*55}")
|
||||
for key, fname in OUTPUT_FILES.items():
|
||||
path = os.path.join(OUTPUT_DIR, fname)
|
||||
size_kb = os.path.getsize(path) // 1024
|
||||
print(f" {fname:<35s} {counts[key]:>7,} lignes ({size_kb:>6,} Ko)")
|
||||
print(f" {'TOTAL':<35s} {total:>7,} lignes")
|
||||
if skipped:
|
||||
print(f"\n ⚠ {skipped} enregistrement(s) ignoré(s) :")
|
||||
for a in anomalies[:10]:
|
||||
print(f" {a}")
|
||||
if len(anomalies) > 10:
|
||||
print(f" … et {len(anomalies) - 10} autres")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user