Add CSV export of mariage table and merged GEDCOM

- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM, TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale qui produisait des artefacts type "FÃ©lix") - csv_export/ : 4 fichiers générés (380 892 enregistrements au total) - merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les identifiants INDI/FAM pour éviter les collisions - lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 15:22:28 +02:00
parent f75cbebb44
commit e467b9662a
7 changed files with 435895 additions and 0 deletions
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+"""
+Export the `mariage` table from basesgen.sql into 4 CSV files,
+grouped by record type:
+  - mariages_religieux.csv   (BMS)
+  - contrats_de_mariages.csv (CM + TABLE CM)
+  - tables_de_mariages.csv   (TABLE MARIAGES)
+  - etat_civil.csv           (ETAT CIVIL)
+"""
+
+import csv
+import os
+import sys
+import time
+
+# Input SQL dump and output directory.
+# NOTE(review): both are absolute, machine-specific paths.
+SQL_FILE = "/home/yannick/Téléchargements/basesgen.sql"
+OUTPUT_DIR = "/home/yannick/BasesCGL/csv_export"
+
+# Columns as they appear in the SQL rows (26 columns)
+SQL_COLUMNS = [
+    "CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE",
+    "NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX",
+    "DATE_NAISSANCE_EPOUX", "VILLE_EPOUX",
+    "NOM_PERE_EPOUX", "NOM_MERE_EPOUX", "VEUF_EPOUX",
+    "NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE",
+    "DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE",
+    "NOM_PERE_EPOUSE", "NOM_MERE_EPOUSE", "VEUVE_EPOUSE",
+    "OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE",
+]
+
+# CSV output columns (30 columns): each of the 4 derived *_DECEDE(E) columns
+# is inserted right after the parent-name column it is derived from
+CSV_COLUMNS = [
+    "CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE",
+    "NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX",
+    "DATE_NAISSANCE_EPOUX", "VILLE_EPOUX",
+    "NOM_PERE_EPOUX", "PERE_EPOUX_DECEDE", "NOM_MERE_EPOUX", "MERE_EPOUX_DECEDEE", "VEUF_EPOUX",
+    "NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE",
+    "DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE",
+    "NOM_PERE_EPOUSE", "PERE_EPOUSE_DECEDE", "NOM_MERE_EPOUSE", "MERE_EPOUSE_DECEDEE", "VEUVE_EPOUSE",
+    "OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE",
+]
+
+TYPE_ACTE_IDX = 24  # index of TYPE_ACTE_BRUT in the parsed SQL row (0-based)
+
+# SQL indices of the parent-name columns (inspected for a trailing '+')
+IDX_PERE_EPOUX  = 10
+IDX_MERE_EPOUX  = 11
+IDX_PERE_EPOUSE = 19
+IDX_MERE_EPOUSE = 20
+
+# Mapping: category key → output file name
+OUTPUT_FILES = {
+    "BMS":          "mariages_religieux.csv",
+    "CM":           "contrats_de_mariages.csv",
+    "TABLES":       "tables_de_mariages.csv",
+    "ETAT_CIVIL":   "etat_civil.csv",
+}
+
+
+def classify(type_acte: str) -> str | None:
+    """Map a raw TYPE_ACTE value to an output category key.
+
+    The value is trimmed, every run of whitespace is collapsed to a single
+    space and the result is upper-cased; the category is then chosen by
+    prefix.  Returns "BMS", "ETAT_CIVIL", "CM" or "TABLES", or None when no
+    known prefix matches.
+    """
+    normalized = " ".join(type_acte.split()).upper()
+
+    # Evaluated top to bottom: the "TABLE C…" rule must come before the
+    # generic "TABLE" rule so contract tables are merged into "CM".
+    prefix_rules = (
+        (("BMS",), "BMS"),
+        # "ETAT CILVIL" is presumably a misspelling present in the data.
+        (("ETAT CIVIL", "ETAT CILVIL"), "ETAT_CIVIL"),
+        (("TABLE CM", "TABLE C"), "CM"),
+        # TABLE MARIAGES, TABLE PUBLICATIONS DE MARIAGES, etc.
+        (("TABLE",), "TABLES"),
+        (("CM",), "CM"),
+    )
+    for prefixes, category in prefix_rules:
+        if normalized.startswith(prefixes):
+            return category
+    return None
+
+
+# Backslash escape sequences with a special meaning inside MySQL string
+# literals.  Any other escaped character stands for itself (the backslash
+# is simply dropped), which also covers \' and \\.
+_MYSQL_ESCAPES = {
+    "0": "\0",
+    "b": "\b",
+    "n": "\n",
+    "r": "\r",
+    "t": "\t",
+    "Z": "\x1a",
+}
+
+
+def parse_mysql_value(src: str, pos: int) -> tuple[str, int]:
+    """Parse a single MySQL value starting at pos; return (value_str, next_pos).
+
+    Leading spaces/tabs are skipped.  Three value forms are recognised:
+
+    * a single-quoted string: backslash escapes are decoded and a doubled
+      quote ('') is read as one literal quote; next_pos is just past the
+      closing quote (or the end of src if the string is unterminated);
+    * the keyword NULL, returned as an empty string;
+    * any other bare token (integer, decimal, ...): everything up to the
+      next comma, with trailing whitespace removed.
+
+    Whenever src[pos] is not a comma, next_pos is strictly greater than pos,
+    so a caller that skips commas itself always makes progress.
+    """
+    end = len(src)
+
+    # skip whitespace
+    while pos < end and src[pos] in (' ', '\t'):
+        pos += 1
+
+    if pos >= end:
+        return ("", pos)
+
+    if src[pos] == "'":
+        # quoted string
+        pos += 1
+        buf = []
+        while pos < end:
+            ch = src[pos]
+            if ch == '\\' and pos + 1 < end:
+                nxt = src[pos + 1]
+                buf.append(_MYSQL_ESCAPES.get(nxt, nxt))
+                pos += 2
+            elif ch == "'":
+                if pos + 1 < end and src[pos + 1] == "'":
+                    # SQL-standard escaped quote inside the string
+                    buf.append("'")
+                    pos += 2
+                    continue
+                pos += 1
+                break
+            else:
+                buf.append(ch)
+                pos += 1
+        return (''.join(buf), pos)
+
+    if src.startswith('NULL', pos):
+        return ('', pos + 4)
+
+    # Bare token.  Consuming up to the next comma (instead of digits only)
+    # keeps decimals intact and guarantees the position advances even on
+    # unexpected characters.
+    j = pos
+    while j < end and src[j] != ',':
+        j += 1
+    return (src[pos:j].rstrip(), j)
+
+
+def parse_row(line: str) -> list[str] | None:
+    """Parse one INSERT tuple line into a list of string values.
+
+    The line must start with '(' and end with ')', '),' or ');' once
+    stripped.  Returns the parsed values when their count equals
+    len(SQL_COLUMNS); returns None for any other line, for a tuple with a
+    different number of values, or for a tuple containing text the value
+    parser cannot consume.
+    """
+    line = line.strip()
+    if not line.startswith('('):
+        return None
+    # Drop the opening '(' and the closing ')' with its optional ',' / ';'.
+    if line.endswith((');', '),')):
+        inner = line[1:-2]
+    elif line.endswith(')'):
+        inner = line[1:-1]
+    else:
+        return None
+
+    values = []
+    pos = 0
+    end = len(inner)
+    while pos < end:
+        # skip whitespace and commas between values
+        if inner[pos] in (' ', '\t', ','):
+            pos += 1
+            continue
+        val, next_pos = parse_mysql_value(inner, pos)
+        if next_pos <= pos:
+            # The value parser made no progress: malformed tuple.  Bail out
+            # instead of looping forever on the same position.
+            return None
+        values.append(val)
+        pos = next_pos
+
+    return values if len(values) == len(SQL_COLUMNS) else None
+
+
+def deceased_flag(name: str) -> str:
+    """Return '1' when the name, ignoring surrounding whitespace, ends with '+'; otherwise ''."""
+    trimmed = name.strip()
+    if trimmed.endswith("+"):
+        return "1"
+    return ""
+
+
+def enrich_row(sql_row: list[str]) -> list[str]:
+    """Build the 30-value CSV row from a 26-value SQL row.
+
+    Values are copied in SQL order; right after each of the four parent-name
+    values, the result of deceased_flag() for that name is inserted.
+    """
+    # (SQL index, whether a deceased flag follows the value), in output order.
+    layout = (
+        [(i, False) for i in range(0, 10)]          # marriage key/date + groom
+        + [(IDX_PERE_EPOUX, True), (IDX_MERE_EPOUX, True)]
+        + [(12, False)]                             # VEUF_EPOUX
+        + [(i, False) for i in range(13, 19)]       # bride
+        + [(IDX_PERE_EPOUSE, True), (IDX_MERE_EPOUSE, True)]
+        + [(21, False)]                             # VEUVE_EPOUSE
+        + [(i, False) for i in range(22, 26)]       # remaining columns
+    )
+
+    enriched = []
+    for sql_idx, with_flag in layout:
+        value = sql_row[sql_idx]
+        enriched.append(value)
+        if with_flag:
+            enriched.append(deceased_flag(value))
+    return enriched
+
+
+def main():
+    """Stream the SQL dump and write the `mariage` rows to the four CSV files.
+
+    SQL_FILE is read line by line.  Lines following an
+    ``INSERT INTO `mariage` `` header are parsed as tuples, classified by
+    their TYPE_ACTE value and appended, enriched with the deceased flags, to
+    the matching CSV file in OUTPUT_DIR.  Progress is printed every 5 % and
+    a summary (row counts, file sizes, first skipped records) at the end.
+    """
+    # Local import so this function stays self-contained.
+    from contextlib import ExitStack
+
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    file_size = os.path.getsize(SQL_FILE)
+
+    counts = {k: 0 for k in OUTPUT_FILES}
+    skipped = 0
+    max_anomalies = 10   # number of skipped records shown in the summary
+    anomalies = []       # only the first max_anomalies descriptions are kept
+
+    # ExitStack closes every opened file even if parsing raises midway.
+    with ExitStack() as stack:
+        # Open the 4 output CSV files (UTF-8 with BOM for Excel compatibility)
+        writers = {}
+        for key, fname in OUTPUT_FILES.items():
+            path = os.path.join(OUTPUT_DIR, fname)
+            fh = stack.enter_context(
+                open(path, 'w', encoding='utf-8-sig', newline=''))
+            writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
+            writer.writerow(CSV_COLUMNS)
+            writers[key] = writer
+
+        start = time.time()
+        bytes_read = 0
+        in_mariage_insert = False
+        last_pct = -1
+
+        print(f"Lecture de {SQL_FILE} ({file_size / 1e9:.2f} Go)…")
+
+        # Binary mode: the progress counter uses the real on-disk byte count
+        # and only the lines of interest are decoded.
+        src = stack.enter_context(open(SQL_FILE, 'rb'))
+        for raw in src:
+            bytes_read += len(raw)
+
+            # Progress display every 5 %
+            pct = int(bytes_read * 100 / file_size)
+            if pct != last_pct and pct % 5 == 0:
+                elapsed = time.time() - start
+                total_est = int(elapsed / max(bytes_read, 1) * file_size)
+                remaining = max(0, total_est - int(elapsed))
+                print(f"  {pct:3d}%  ({bytes_read / 1e9:.2f} Go)  "
+                      f"~{remaining // 60}m{remaining % 60:02d}s restant", flush=True)
+                last_pct = pct
+
+            # Start of an INSERT block for the mariage table.  The header
+            # line itself is skipped, so tuples are expected on later lines.
+            if b'INSERT INTO `mariage`' in raw:
+                in_mariage_insert = True
+                continue
+
+            if not in_mariage_insert:
+                continue
+
+            line = raw.decode('utf-8', errors='replace')
+            stripped = line.strip()
+
+            # A blank line or a comment ends the block
+            if stripped == '' or stripped.startswith(('--', '/*!')):
+                in_mariage_insert = False
+                continue
+
+            # A statement terminator ends the block after this line, whether
+            # or not the line parses; otherwise rows of the following tables
+            # would be read as mariage rows.
+            if stripped.endswith(';'):
+                in_mariage_insert = False
+
+            row = parse_row(line)
+
+            if row is None:
+                if stripped.startswith('('):
+                    skipped += 1
+                    if len(anomalies) < max_anomalies:
+                        anomalies.append(stripped[:120])
+                continue
+
+            type_acte = row[TYPE_ACTE_IDX]
+            cat = classify(type_acte)
+
+            if cat is None:
+                skipped += 1
+                if len(anomalies) < max_anomalies:
+                    anomalies.append(f"TYPE_ACTE inconnu: {repr(type_acte)}")
+                continue
+
+            writers[cat].writerow(enrich_row(row))
+            counts[cat] += 1
+
+    # All files are closed here, so the sizes reported below are final.
+    elapsed = int(time.time() - start)
+    total = sum(counts.values())
+
+    print(f"\n{'='*55}")
+    print(f"Export terminé en {elapsed // 60}m{elapsed % 60:02d}s")
+    print(f"{'='*55}")
+    for key, fname in OUTPUT_FILES.items():
+        path = os.path.join(OUTPUT_DIR, fname)
+        size_kb = os.path.getsize(path) // 1024
+        print(f"  {fname:<35s} {counts[key]:>7,} lignes  ({size_kb:>6,} Ko)")
+    print(f"  {'TOTAL':<35s} {total:>7,} lignes")
+    if skipped:
+        print(f"\n  ⚠  {skipped} enregistrement(s) ignoré(s) :")
+        for a in anomalies:
+            print(f"       {a}")
+        if skipped > len(anomalies):
+            print(f"       … et {skipped - len(anomalies)} autres")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""
+
+import glob
+import os
+import re
+import sys
+
+# Directory scanned for *.ged input files and path of the merged output
+# (both relative to the current working directory).
+INPUT_DIR = "gedcom_output"
+OUTPUT_FILE = "lignees.ged"
+
+def parse_gedcom(path):
+    """Split a GEDCOM file into its level-0 records.
+
+    Returns (header_note, records):
+      - header_note is the text of the last "1 NOTE " line found in the HEAD
+        record, or "" when there is none;
+      - records is a list of raw record strings (lines joined with
+        newlines), HEAD and TRLR excluded.
+
+    The file is read as UTF-8; a leading byte-order mark is ignored and
+    blank lines are dropped.
+    """
+    records = []
+    notes = []
+
+    def store(record_lines):
+        """File one finished level-0 record under notes or records."""
+        if not record_lines:
+            return
+        tag = record_lines[0].rstrip()
+        if tag == "0 HEAD":
+            # Keep the NOTE text for source tracking
+            notes.extend(l[7:] for l in record_lines if l.startswith("1 NOTE "))
+        elif tag != "0 TRLR":
+            records.append("\n".join(record_lines))
+
+    current = []
+    # utf-8-sig: without it a BOM hides the "0 HEAD" line and the header
+    # would be returned as an ordinary record.
+    with open(path, encoding="utf-8-sig") as f:
+        # Iterating the file splits on real line endings only, unlike
+        # str.splitlines() which also breaks on characters such as U+2028.
+        for raw in f:
+            line = raw.rstrip("\r\n")
+            if not line.strip():
+                continue
+            if line.startswith("0 "):
+                store(current)
+                current = [line]
+            else:
+                current.append(line)
+    store(current)
+
+    header_note = notes[-1] if notes else ""
+    return header_note, records
+
+
+def renumber_records(records, indi_offset, fam_offset):
+    """Shift every @I<n>@ and @F<n>@ identifier by the given offsets.
+
+    Each match, whether it defines a record or references one, is rewritten
+    with n increased by indi_offset (I) or fam_offset (F) and zero-padded to
+    at least 4 digits.  Other identifiers (e.g. @S1@) are left untouched.
+    Returns a new list; the input list is not modified.
+    """
+    shift_by = {"I": indi_offset, "F": fam_offset}
+    xref = re.compile(r"@([IF])(\d+)@")
+
+    def shifted(match):
+        prefix, digits = match.groups()
+        return f"@{prefix}{int(digits) + shift_by[prefix]:04d}@"
+
+    return [xref.sub(shifted, rec) for rec in records]
+
+
+def _record_numbers(records, kind):
+    """Return the numeric IDs of the level-0 records of *kind* ("I" or "F")."""
+    pattern = re.compile(r"0 @" + kind + r"(\d+)@")
+    numbers = []
+    for record in records:
+        found = pattern.match(record)
+        if found:
+            numbers.append(int(found.group(1)))
+    return numbers
+
+
+def main():
+    """Merge every *.ged file of INPUT_DIR into OUTPUT_FILE.
+
+    Files are processed in sorted order.  The INDI and FAM identifiers of
+    each file are shifted so they cannot collide with those already merged,
+    then all records are written after a freshly generated HEAD record that
+    lists one source line per input file.  Exits with status 1 when no
+    input file is found.
+    """
+    ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
+    if not ged_files:
+        print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Found {len(ged_files)} GEDCOM files to merge.")
+
+    all_records = []
+    sources = []
+    indi_offset = 0
+    fam_offset = 0
+    # Smallest INDI / FAM numbers not yet used in the merged output.
+    next_indi = 0
+    next_fam = 0
+
+    for path in ged_files:
+        basename = os.path.basename(path)
+        note, records = parse_gedcom(path)
+        sources.append(note or basename)
+
+        indi_count = sum(1 for r in records if r.startswith("0 @I"))
+        fam_count = sum(1 for r in records if r.startswith("0 @F"))
+
+        # The offsets normally grow by the number of records merged so far.
+        # That is only collision-free when IDs are contiguous, so raise the
+        # offset whenever this file's lowest ID would land on a used number.
+        indi_ids = _record_numbers(records, "I")
+        fam_ids = _record_numbers(records, "F")
+        if indi_ids:
+            indi_offset = max(indi_offset, next_indi - min(indi_ids))
+        if fam_ids:
+            fam_offset = max(fam_offset, next_fam - min(fam_ids))
+
+        renumbered = renumber_records(records, indi_offset, fam_offset)
+        all_records.extend(renumbered)
+
+        print(f"  {basename}: {indi_count} INDI, {fam_count} FAM  (offset I+{indi_offset}, F+{fam_offset})")
+        if indi_ids:
+            next_indi = indi_offset + max(indi_ids) + 1
+        if fam_ids:
+            next_fam = fam_offset + max(fam_ids) + 1
+        indi_offset += indi_count
+        fam_offset += fam_count
+
+    total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
+    total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
+    print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")
+
+    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
+        out.write("0 HEAD\n")
+        out.write("1 SOUR BaseCGL\n")
+        out.write("2 NAME CGL Bases généalogiques du Languedoc – basesgen.sql\n")
+        out.write("1 GEDC\n")
+        out.write("2 VERS 5.5.1\n")
+        out.write("2 FORM LINEAGE-LINKED\n")
+        out.write("1 CHAR UTF-8\n")
+        # Report the real number of merged files instead of a fixed figure.
+        out.write(f"1 NOTE Lignées CGL – fusion de {len(ged_files)} exports GEDCOM\n")
+        for src in sources:
+            out.write(f"2 CONT {src}\n")
+        # No blank separator lines: GEDCOM writers are expected not to emit
+        # extra line terminators between lines.
+        for record in all_records:
+            out.write(record)
+            out.write("\n")
+        out.write("0 TRLR\n")
+
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()