e467b9662a
- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM, TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale qui produisait des artefacts type "Félix") - csv_export/ : 4 fichiers générés (380 892 enregistrements au total) - merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les identifiants INDI/FAM pour éviter les collisions - lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
114 lines
3.4 KiB
Python
114 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
||
"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""
|
||
|
||
import glob
|
||
import os
|
||
import re
|
||
import sys
|
||
|
||
# Directory that main() scans for the *.ged files to merge.
INPUT_DIR = "gedcom_output"
|
||
# Path of the merged GEDCOM file that main() writes (overwritten on each run).
OUTPUT_FILE = "lignees.ged"
|
||
|
||
def parse_gedcom(path):
    """Split a GEDCOM file into its level-0 records.

    Args:
        path: Path of the GEDCOM file to read (UTF-8, with or without BOM).

    Returns:
        A ``(header_note, records)`` tuple. ``header_note`` is the text of the
        last ``1 NOTE`` line found in the ``0 HEAD`` record ("" if there is
        none). ``records`` is a list of raw record strings, one per level-0
        record, lines joined with "\\n"; the ``0 HEAD`` and ``0 TRLR`` records
        are left out.
    """
    # "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark.
    # With plain "utf-8" the BOM stays glued to "0 HEAD", the header is no
    # longer recognised, and it ends up in the record list.
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()

    # First pass: group the lines by level-0 record.
    groups = []
    for line in lines:
        # Blank separator lines carry no data and must not become part of
        # the preceding record.
        if not line.strip():
            continue
        # Lines seen before any level-0 line still open a group of their own,
        # so nothing from the input is dropped silently.
        if line.startswith("0 ") or not groups:
            groups.append([line])
        else:
            groups[-1].append(line)

    # Second pass: pull the note out of the header, drop header and trailer.
    header_note = ""
    records = []
    for group in groups:
        tag = group[0]
        if tag == "0 HEAD":
            # Keep the NOTE line for source tracking (last one wins).
            for entry in group:
                if entry.startswith("1 NOTE "):
                    header_note = entry[7:]
        elif tag != "0 TRLR":
            records.append("\n".join(group))

    return header_note, records
|
||
|
||
|
||
def renumber_records(records, indi_offset, fam_offset):
    """Shift every @I<n>@ / @F<n>@ identifier found in the given records.

    Args:
        records: Iterable of raw record strings.
        indi_offset: Amount added to the number of each @I...@ identifier.
        fam_offset: Amount added to the number of each @F...@ identifier.

    Returns:
        A new list of record strings in which each matching identifier is
        rewritten with the shifted number, zero-padded to at least 4 digits.
    """
    shift_by = {"I": indi_offset, "F": fam_offset}

    def shifted(match):
        # group(1) is the "I"/"F" prefix, group(2) the numeric part.
        prefix, digits = match.groups()
        return f"@{prefix}{int(digits) + shift_by[prefix]:04d}@"

    return [re.sub(r"@([IF])(\d+)@", shifted, rec) for rec in records]
|
||
|
||
|
||
def main():
    """Merge every ``*.ged`` file of INPUT_DIR into OUTPUT_FILE.

    Files are processed in sorted path order. For each file the INDI/FAM
    identifiers are shifted by a running offset so that identifiers coming
    from different files never collide, then all records are written to
    OUTPUT_FILE between a freshly generated ``0 HEAD`` and a ``0 TRLR``.
    Progress is printed to stdout.

    Exits with status 1 (message on stderr) when INPUT_DIR contains no
    ``*.ged`` file.
    """
    ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
    if not ged_files:
        print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(ged_files)} GEDCOM files to merge.")

    def defined_ids(records, kind):
        """Return the numeric part of each @<kind><n>@ ID defined by *records*."""
        pattern = re.compile(rf"0 @{kind}(\d+)@")
        return [int(m.group(1)) for m in map(pattern.match, records) if m]

    all_records = []
    sources = []
    indi_offset = 0
    fam_offset = 0
    # Lowest output numbers that are still unused for each identifier kind.
    next_free_indi = 0
    next_free_fam = 0

    for path in ged_files:
        basename = os.path.basename(path)
        note, records = parse_gedcom(path)
        sources.append(note or basename)

        indi_count = sum(1 for r in records if r.startswith("0 @I"))
        fam_count = sum(1 for r in records if r.startswith("0 @F"))
        indi_ids = defined_ids(records, "I")
        fam_ids = defined_ids(records, "F")

        # The running offset advances by record count, which is only enough
        # when each file numbers its records contiguously. If an earlier file
        # had gaps in its numbering, raise the offset so that this file's
        # lowest ID lands on a number that is still free.
        if indi_ids:
            indi_offset = max(indi_offset, next_free_indi - min(indi_ids))
        if fam_ids:
            fam_offset = max(fam_offset, next_free_fam - min(fam_ids))

        renumbered = renumber_records(records, indi_offset, fam_offset)
        all_records.extend(renumbered)

        print(f"  {basename}: {indi_count} INDI, {fam_count} FAM (offset I+{indi_offset}, F+{fam_offset})")

        if indi_ids:
            next_free_indi = indi_offset + max(indi_ids) + 1
        if fam_ids:
            next_free_fam = fam_offset + max(fam_ids) + 1
        indi_offset += indi_count
        fam_offset += fam_count

    total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
    total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
    print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write("0 HEAD\n")
        out.write("1 SOUR BaseCGL\n")
        out.write("2 NAME CGL Bases généalogiques du Languedoc – basesgen.sql\n")
        out.write("1 GEDC\n")
        out.write("2 VERS 5.5.1\n")
        out.write("2 FORM LINEAGE-LINKED\n")
        out.write("1 CHAR UTF-8\n")
        # Report the number of files actually merged instead of a fixed count.
        out.write(f"1 NOTE Lignées CGL – fusion de {len(ged_files)} exports GEDCOM\n")
        # One continuation line per merged file: its header note, or its
        # file name when the file had no note.
        for src in sources:
            out.write(f"2 CONT {src}\n")
        out.write("\n")
        for record in all_records:
            out.write(record)
            out.write("\n\n")
        out.write("0 TRLR\n")

    print("Done.")
|
||
|
||
|
||
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|