Files
BasesCGL/merge_gedcom.py
T
yann64 e467b9662a Add CSV export of mariage table and merged GEDCOM
- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql
  en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM,
  TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées
  PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les
  noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale
  qui produisait des artefacts type "FÃ©lix")
- csv_export/ : 4 fichiers générés (380 892 enregistrements au total)
- merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les
  identifiants INDI/FAM pour éviter les collisions
- lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 15:22:28 +02:00

114 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""
import glob
import os
import re
import sys
# Directory (relative path) that main() searches for the "*.ged" files to merge.
INPUT_DIR = "gedcom_output"
# Path (relative) of the merged GEDCOM file that main() writes.
OUTPUT_FILE = "lignees.ged"
def parse_gedcom(path):
    """Split a GEDCOM file into its header note and its level-0 records.

    Args:
        path: Path of a UTF-8 encoded GEDCOM file; a leading byte-order mark
            is accepted and ignored.

    Returns:
        A ``(header_note, records)`` tuple. ``header_note`` is the text of
        the last ``1 NOTE`` line found in the ``0 HEAD`` record, or ``""``
        when there is none. ``records`` is a list of strings, one per
        level-0 record in file order, each holding the record's lines
        joined with ``"\\n"``; the ``0 HEAD`` and ``0 TRLR`` records are
        left out.
    """
    # "utf-8-sig" drops a leading BOM. With plain "utf-8" the first line
    # would read "\ufeff0 HEAD", fail the "0 " / "0 HEAD" checks below, and
    # the header would end up in the returned records with its note lost.
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()

    # Group lines into chunks, opening a new chunk at every level-0 line.
    # Lines that precede the first level-0 line form a chunk of their own.
    chunks = []
    for line in lines:
        if line.startswith("0 ") or not chunks:
            chunks.append([line])
        else:
            chunks[-1].append(line)

    header_note = ""
    records = []
    for chunk in chunks:
        # Ignore trailing whitespace so "0 HEAD " is still seen as the header.
        tag = chunk[0].rstrip()
        if tag == "0 HEAD":
            # Keep the NOTE line for source tracking. Every chunk goes
            # through this branch, so a header that is the final record
            # (file without TRLR) still contributes its note.
            for entry in chunk:
                if entry.startswith("1 NOTE "):
                    header_note = entry[7:]
        elif tag != "0 TRLR":
            records.append("\n".join(chunk))
    return header_note, records
def renumber_records(records, indi_offset, fam_offset):
    """Shift every ``@I<n>@`` / ``@F<n>@`` identifier found in the records.

    Args:
        records: Iterable of record strings.
        indi_offset: Amount added to the number of each ``@I...@`` identifier.
        fam_offset: Amount added to the number of each ``@F...@`` identifier.

    Returns:
        A new list with one rewritten string per input record. Shifted
        numbers are zero-padded to at least four digits.
    """
    shift_for = {"I": indi_offset, "F": fam_offset}
    xref = re.compile(r"@([IF])(\d+)@")

    def shifted(match):
        # The letter selects which offset applies to the captured number.
        letter, digits = match.groups()
        return f"@{letter}{int(digits) + shift_for[letter]:04d}@"

    return [xref.sub(shifted, text) for text in records]
def main():
    """Merge every ``*.ged`` file found in INPUT_DIR into OUTPUT_FILE.

    Files are processed in sorted path order. For each file the INDI and FAM
    identifiers are shifted so that they land above every identifier already
    issued for earlier files, and the shifted records are appended to the
    merged list. The output is a freshly generated HEAD record, the merged
    records, and a TRLR line. The HEAD note lists one source per input file:
    the file's own header note, or its base name when it has none.

    Exits with status 1, after a message on stderr, when INPUT_DIR contains
    no ``.ged`` file.
    """
    ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
    if not ged_files:
        print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
        sys.exit(1)
    print(f"Found {len(ged_files)} GEDCOM files to merge.")

    # Same pattern renumber_records() rewrites, so every identifier that
    # gets shifted is also taken into account when choosing the next offset.
    xref_pattern = re.compile(r"@([IF])(\d+)@")

    all_records = []
    sources = []
    # Highest identifier number issued so far, per kind; -1 means none yet.
    highest = {"I": -1, "F": -1}

    for path in ged_files:
        basename = os.path.basename(path)
        note, records = parse_gedcom(path)
        sources.append(note or basename)

        indi_count = sum(1 for r in records if r.startswith("0 @I"))
        fam_count = sum(1 for r in records if r.startswith("0 @F"))

        # Identifier numbers this file actually uses, per kind.
        used = {"I": [], "F": []}
        for record in records:
            for kind, digits in xref_pattern.findall(record):
                used[kind].append(int(digits))

        # Advancing the offset by the record count is only collision-free
        # when ids run 1..n without gaps. Deriving it from the ids in use
        # keeps sparse or 0-based numbering unique too, and yields the same
        # offsets as counting for contiguous 1-based files.
        offsets = {}
        for kind, numbers in used.items():
            if numbers:
                offsets[kind] = max(0, highest[kind] + 1 - min(numbers))
                highest[kind] = max(highest[kind], max(numbers) + offsets[kind])
            else:
                # Nothing of this kind to shift; value is only displayed.
                offsets[kind] = max(0, highest[kind])
        indi_offset = offsets["I"]
        fam_offset = offsets["F"]

        all_records.extend(renumber_records(records, indi_offset, fam_offset))
        print(f" {basename}: {indi_count} INDI, {fam_count} FAM (offset I+{indi_offset}, F+{fam_offset})")

    total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
    total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
    print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write("0 HEAD\n")
        out.write("1 SOUR BaseCGL\n")
        out.write("2 NAME CGL Bases généalogiques du Languedoc basesgen.sql\n")
        out.write("1 GEDC\n")
        out.write("2 VERS 5.5.1\n")
        out.write("2 FORM LINEAGE-LINKED\n")
        out.write("1 CHAR UTF-8\n")
        # Report the real number of merged files instead of a fixed "16".
        out.write(f"1 NOTE Lignées CGL fusion de {len(ged_files)} exports GEDCOM\n")
        for src in sources:
            out.write(f"2 CONT {src}\n")
        out.write("\n")
        for record in all_records:
            out.write(record)
            out.write("\n\n")
        out.write("0 TRLR\n")
    print("Done.")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()