Add CSV export of mariage table and merged GEDCOM

- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql
  en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM,
  TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées
  PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les
  noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale
  qui produisait des artefacts type "FÃ©lix")
- csv_export/ : 4 fichiers générés (380 892 enregistrements au total)
- merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les
  identifiants INDI/FAM pour éviter les collisions
- lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 15:22:28 +02:00
parent f75cbebb44
commit e467b9662a
7 changed files with 435895 additions and 0 deletions
+113
View File
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""
import glob
import os
import re
import sys
# Directory that is searched for the individual "*.ged" files to merge.
INPUT_DIR = "gedcom_output"
# Path of the merged GEDCOM file written by the script.
OUTPUT_FILE = "lignees.ged"
def parse_gedcom(path):
    """Split a GEDCOM file into its level-0 records.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    discarded so that the first line is still recognised as ``0 HEAD``
    instead of leaking into the returned records. Blank (or whitespace-only)
    lines are not GEDCOM lines and are skipped.

    Args:
        path: Path of the GEDCOM file to read.

    Returns:
        A ``(header_note, records)`` tuple. ``header_note`` is the text that
        follows ``1 NOTE `` in the ``0 HEAD`` record (the last such line wins;
        empty string when there is none). ``records`` holds one string per
        remaining level-0 record, in file order, with the record's lines
        joined by newlines. The ``0 HEAD`` and ``0 TRLR`` records are never
        included.
    """
    note_prefix = "1 NOTE "

    # "utf-8-sig" behaves like "utf-8" but also swallows an optional BOM.
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()

    # First pass: group the lines by the level-0 line that opens each record.
    groups = []
    for line in lines:
        if not line.strip():
            continue
        # Lines that precede the first level-0 line still open a group so
        # that no input is silently dropped.
        if line.startswith("0 ") or not groups:
            groups.append([line])
        else:
            groups[-1].append(line)

    # Second pass: classify every group the same way, wherever it sits in
    # the file (a header that is the last record is handled like any other).
    header_note = ""
    records = []
    for group in groups:
        tag = group[0]
        if tag == "0 HEAD":
            # Keep the NOTE text for source tracking.
            for entry in group:
                if entry.startswith(note_prefix):
                    header_note = entry[len(note_prefix):]
        elif tag != "0 TRLR":
            records.append("\n".join(group))
    return header_note, records
def renumber_records(records, indi_offset, fam_offset):
    """Shift every ``@I<n>@`` and ``@F<n>@`` pointer in *records*.

    Individual IDs are increased by *indi_offset* and family IDs by
    *fam_offset*; the new number is zero-padded to at least four digits.
    Both record definitions and cross-references are rewritten, since the
    substitution runs over the whole text of each record.

    Returns a new list with one rewritten string per input record.
    """
    pointer = re.compile(r"@([IF])(\d+)@")

    def shifted(match):
        # group(1) is the pointer kind, group(2) its numeric part.
        kind, digits = match.groups()
        value = int(digits)
        if kind != "I":
            return f"@F{value + fam_offset:04d}@"
        return f"@I{value + indi_offset:04d}@"

    return [pointer.sub(shifted, text) for text in records]
def _id_span(records, kind):
    """Return ``(lowest, highest)`` numeric ID of *kind* used in *records*.

    *kind* is ``"I"`` or ``"F"``. Record definitions and cross-references are
    both taken into account. Returns ``None`` when no such ID occurs.
    """
    numbers = [
        int(digits)
        for record in records
        for digits in re.findall(rf"@{kind}(\d+)@", record)
    ]
    if not numbers:
        return None
    return min(numbers), max(numbers)


def _collision_free_offset(span, dense_offset, highest_used):
    """Return the offset to apply to a file whose IDs cover *span*.

    *dense_offset* is the running total of records merged so far, which is
    the right offset when every file numbers its IDs without gaps. It is
    raised just enough for the file's lowest ID to land above
    *highest_used* when an earlier file had gaps in its numbering.
    """
    if span is None:
        return dense_offset
    lowest = span[0]
    return max(dense_offset, highest_used + 1 - lowest)


def main():
    """Merge every ``*.ged`` file of INPUT_DIR into OUTPUT_FILE.

    Files are processed in sorted path order. The INDI and FAM identifiers
    of each file are shifted so that they cannot clash with identifiers
    already written, then all records are written under one new header whose
    NOTE lists the header note (or, failing that, the file name) of every
    merged file. Progress is printed to stdout.

    Exits with status 1 when INPUT_DIR contains no ``.ged`` file.
    """
    ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
    if not ged_files:
        print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
        sys.exit(1)
    print(f"Found {len(ged_files)} GEDCOM files to merge.")

    all_records = []
    sources = []
    indi_offset = 0
    fam_offset = 0
    # Highest ID number written so far for each kind (-1: nothing yet).
    indi_highest = -1
    fam_highest = -1
    for path in ged_files:
        basename = os.path.basename(path)
        note, records = parse_gedcom(path)
        sources.append(note or basename)
        indi_count = sum(1 for r in records if r.startswith("0 @I"))
        fam_count = sum(1 for r in records if r.startswith("0 @F"))

        # Counting records only yields a safe offset when IDs are dense;
        # widen it when an earlier file used numbers beyond its record count.
        indi_span = _id_span(records, "I")
        fam_span = _id_span(records, "F")
        indi_offset = _collision_free_offset(indi_span, indi_offset, indi_highest)
        fam_offset = _collision_free_offset(fam_span, fam_offset, fam_highest)

        all_records.extend(renumber_records(records, indi_offset, fam_offset))
        print(f" {basename}: {indi_count} INDI, {fam_count} FAM (offset I+{indi_offset}, F+{fam_offset})")

        if indi_span is not None:
            indi_highest = max(indi_highest, indi_span[1] + indi_offset)
        if fam_span is not None:
            fam_highest = max(fam_highest, fam_span[1] + fam_offset)
        indi_offset += indi_count
        fam_offset += fam_count

    total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
    total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
    print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write("0 HEAD\n")
        out.write("1 SOUR BaseCGL\n")
        out.write("2 NAME CGL Bases généalogiques du Languedoc basesgen.sql\n")
        out.write("1 GEDC\n")
        out.write("2 VERS 5.5.1\n")
        out.write("2 FORM LINEAGE-LINKED\n")
        out.write("1 CHAR UTF-8\n")
        # Report the real number of merged files rather than a fixed figure.
        out.write(f"1 NOTE Lignées CGL fusion de {len(ged_files)} exports GEDCOM\n")
        for src in sources:
            out.write(f"2 CONT {src}\n")
        # Records are separated by an empty line in the merged file.
        # NOTE(review): strict GEDCOM readers may not expect empty lines;
        # confirm with the software that consumes this file.
        out.write("\n")
        for record in all_records:
            out.write(record)
            out.write("\n\n")
        out.write("0 TRLR\n")
    print("Done.")
# Run the merge only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()