# BasesCGL/merge_gedcom.py

#!/usr/bin/env python3
"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""

import glob
import os
import re
import sys

# Directory searched for "*.ged" input files by main(); relative to the current working directory.
INPUT_DIR = "gedcom_output"
# Path of the merged GEDCOM file written by main(); relative to the current working directory.
OUTPUT_FILE = "lignees.ged"

def parse_gedcom(path):
    """Split a GEDCOM file into its level-0 records.

    Args:
        path: Path of a UTF-8 encoded GEDCOM file (with or without a BOM).

    Returns:
        A ``(header_note, records)`` tuple. ``header_note`` is the text of the
        last ``1 NOTE`` line found in the ``0 HEAD`` record ("" when there is
        none). ``records`` is a list of raw record strings (lines joined with
        "\\n"), in file order, excluding the ``0 HEAD`` and ``0 TRLR`` records.
    """
    # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the BOM
    # stays glued to the first line, "0 HEAD" is not recognised, and the
    # header ends up among the records.
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()

    # Group lines into chunks, each starting at a level-0 line.
    chunks = []
    for line in lines:
        if not line.strip():
            # Blank separator lines carry no data; keeping them would append
            # empty lines to the end of the preceding record.
            continue
        if line.startswith("0 ") or not chunks:
            # Lines seen before any level-0 line still open a chunk so that
            # no input is silently discarded.
            chunks.append([line])
        else:
            chunks[-1].append(line)

    header_note = ""
    records = []
    for chunk in chunks:
        tag = chunk[0].rstrip()
        if tag == "0 HEAD":
            # Extract the NOTE line for source tracking (also when HEAD is
            # the last chunk of the file).
            for sub_line in chunk:
                if sub_line.startswith("1 NOTE "):
                    header_note = sub_line[7:]
        elif tag != "0 TRLR":
            records.append("\n".join(chunk))

    return header_note, records


def renumber_records(records, indi_offset, fam_offset):
    """Shift every ``@I<n>@`` / ``@F<n>@`` identifier by the matching offset.

    Args:
        records: Iterable of raw record strings.
        indi_offset: Amount added to the number of each ``@I…@`` identifier.
        fam_offset: Amount added to the number of each ``@F…@`` identifier.

    Returns:
        A new list of record strings in which each shifted number is written
        zero-padded to at least four digits.
    """
    shift_by_kind = {"I": indi_offset, "F": fam_offset}
    xref_pattern = re.compile(r"@([IF])(\d+)@")

    def shifted(match):
        prefix, digits = match.groups()
        return f"@{prefix}{int(digits) + shift_by_kind[prefix]:04d}@"

    return [xref_pattern.sub(shifted, text) for text in records]


def _collision_free_offset(records, kind, offset, highest_used):
    """Raise ``offset`` if needed so this file's IDs land above all emitted ones.

    Args:
        records: Raw record strings of the file about to be renumbered.
        kind: "I" or "F", the identifier prefix to inspect.
        offset: Offset proposed so far (cumulative record count).
        highest_used: Highest renumbered ID of this kind emitted so far
            (-1 when none).

    Returns:
        ``(offset, highest_used)`` updated for this file.
    """
    numbers = [int(n) for rec in records for n in re.findall(rf"@{kind}(\d+)@", rec)]
    if not numbers:
        return offset, highest_used
    # The smallest ID of this file must map strictly above everything used.
    offset = max(offset, highest_used + 1 - min(numbers))
    return offset, max(highest_used, max(numbers) + offset)


def main():
    """Merge every ``*.ged`` file of INPUT_DIR into OUTPUT_FILE.

    Files are processed in sorted path order. The INDI/FAM identifiers of each
    file are shifted by a running offset so they do not clash with those of
    the files merged before it. A fresh HEAD record is written whose NOTE
    lists one source line per input file (the file's own header NOTE, or its
    basename when it has none).

    Exits with status 1 when INPUT_DIR contains no ``.ged`` file.
    """
    ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
    if not ged_files:
        print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(ged_files)} GEDCOM files to merge.")

    all_records = []
    indi_offset = 0
    fam_offset = 0
    # Highest renumbered ID emitted so far for each kind (-1: none yet).
    highest_indi = -1
    highest_fam = -1
    sources = []

    for path in ged_files:
        basename = os.path.basename(path)
        note, records = parse_gedcom(path)
        sources.append(note or basename)

        indi_count = sum(1 for r in records if r.startswith("0 @I"))
        fam_count = sum(1 for r in records if r.startswith("0 @F"))

        # Counting records is only collision-free when each file numbers its
        # IDs contiguously from the same base; lift the offset when a file
        # has gaps or a different base. Contiguous input keeps the same
        # offsets as plain counting.
        indi_offset, highest_indi = _collision_free_offset(
            records, "I", indi_offset, highest_indi
        )
        fam_offset, highest_fam = _collision_free_offset(
            records, "F", fam_offset, highest_fam
        )

        renumbered = renumber_records(records, indi_offset, fam_offset)
        all_records.extend(renumbered)

        print(f"  {basename}: {indi_count} INDI, {fam_count} FAM  (offset I+{indi_offset}, F+{fam_offset})")
        indi_offset += indi_count
        fam_offset += fam_count

    total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
    total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
    print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write("0 HEAD\n")
        out.write("1 SOUR BaseCGL\n")
        out.write("2 NAME CGL Bases généalogiques du Languedoc – basesgen.sql\n")
        out.write("1 GEDC\n")
        out.write("2 VERS 5.5.1\n")
        out.write("2 FORM LINEAGE-LINKED\n")
        out.write("1 CHAR UTF-8\n")
        # Report the real number of merged files instead of a fixed count.
        out.write(f"1 NOTE Lignées CGL – fusion de {len(ged_files)} exports GEDCOM\n")
        for src in sources:
            out.write(f"2 CONT {src}\n")
        out.write("\n")
        for record in all_records:
            out.write(record)
            out.write("\n\n")
        out.write("0 TRLR\n")

    print("Done.")


# Run the merge only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()