f75cbebb44
Includes export_lignees_to_gedcom.py (Drupal book → GEDCOM 5.5.1), export_users_to_webtrees.py, generated GEDCOM files for 16 family lineages, and webtrees user import SQL. Excludes basesgen.sql (966 MB) and webtrees_temp_passwords.csv (sensitive). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1691 lines
61 KiB
Python
1691 lines
61 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Drupal 6 'lignées familiales' → GEDCOM 5.5.1 exporter.
|
||
|
||
Reads basesgen.sql (drupal_node + drupal_node_revisions, type='book')
|
||
and writes one .ged file per family under ./gedcom_output/.
|
||
|
||
Persons are identified by their hierarchical ID (e.g. "1.4.4b.1"):
|
||
- dots separate parent→child relationships
|
||
- a letter suffix on the last component (a/b/c…) identifies which
|
||
union of the parent produced this child
|
||
|
||
Usage:
|
||
python3 export_lignees_to_gedcom.py
|
||
"""
|
||
|
||
import re
|
||
import sys
|
||
import html
|
||
from pathlib import Path
|
||
from bs4 import BeautifulSoup
|
||
|
||
# ── Configuration ─────────────────────────────────────────────────────────────
|
||
|
||
# Both the input dump and the output folder live under the same base directory.
_BASE_DIR = Path("/home/yann64/BaseCGL")

# Drupal SQL dump read by the exporter (see module docstring).
SQL_FILE = _BASE_DIR / "basesgen.sql"
# Folder that receives one .ged file per family.
OUT_DIR = _BASE_DIR / "gedcom_output"
# Human-readable label for the data source.
# NOTE(review): presumably written into the GEDCOM output — confirm at the call site.
SOURCE_STR = "CGL Bases généalogiques du Languedoc – basesgen.sql"
|
||
|
||
# ── French place hierarchy ────────────────────────────────────────────────────
|
||
|
||
# Département names grouped under their région (post-2016 reform, mainland + DOM).
# The order of this table is the insertion order of _DEPT_TO_REGION.
_DEPTS_BY_REGION: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("Auvergne-Rhône-Alpes", (
        "Ain", "Allier", "Ardèche", "Cantal", "Drôme", "Isère", "Loire",
        "Haute-Loire", "Puy-de-Dôme", "Rhône", "Savoie", "Haute-Savoie",
    )),
    ("Bourgogne-Franche-Comté", (
        "Côte-d'Or", "Doubs", "Jura", "Nièvre", "Haute-Saône",
        "Saône-et-Loire", "Yonne", "Territoire de Belfort",
    )),
    ("Bretagne", (
        "Côtes-d'Armor", "Finistère", "Ille-et-Vilaine", "Morbihan",
    )),
    ("Centre-Val de Loire", (
        "Cher", "Eure-et-Loir", "Indre", "Indre-et-Loire", "Loir-et-Cher",
        "Loiret",
    )),
    ("Corse", (
        "Corse-du-Sud", "Haute-Corse",
    )),
    ("Grand Est", (
        "Ardennes", "Aube", "Marne", "Haute-Marne", "Meurthe-et-Moselle",
        "Meuse", "Moselle", "Bas-Rhin", "Haut-Rhin", "Vosges",
    )),
    ("Hauts-de-France", (
        "Aisne", "Nord", "Oise", "Pas-de-Calais", "Somme",
    )),
    ("Île-de-France", (
        "Paris", "Ville-de-Paris", "Seine-et-Marne", "Yvelines", "Essonne",
        "Hauts-de-Seine", "Seine-Saint-Denis", "Val-de-Marne", "Val-d'Oise",
    )),
    ("Normandie", (
        "Calvados", "Eure", "Manche", "Orne", "Seine-Maritime",
    )),
    ("Nouvelle-Aquitaine", (
        "Charente", "Charente-Maritime", "Corrèze", "Creuse", "Dordogne",
        "Gironde", "Landes", "Lot-et-Garonne", "Pyrénées-Atlantiques",
        "Deux-Sèvres", "Vienne", "Haute-Vienne",
    )),
    ("Occitanie", (
        "Ariège", "Aude", "Aveyron", "Gard", "Haute-Garonne", "Gers",
        "Hérault", "Lot", "Lozère", "Hautes-Pyrénées", "Pyrénées-Orientales",
        "Tarn", "Tarn-et-Garonne",
    )),
    ("Pays de la Loire", (
        "Loire-Atlantique", "Maine-et-Loire", "Mayenne", "Sarthe", "Vendée",
    )),
    ("Provence-Alpes-Côte d'Azur", (
        "Alpes-de-Haute-Provence", "Hautes-Alpes", "Alpes-Maritimes",
        "Bouches-du-Rhône", "Var", "Vaucluse",
    )),
    # DOM — each département maps to a région of the same name
    ("Guadeloupe", ("Guadeloupe",)),
    ("Martinique", ("Martinique",)),
    ("Guyane", ("Guyane",)),
    ("La Réunion", ("La Réunion",)),
    ("Mayotte", ("Mayotte",)),
)

# Historical names (pre-reform or pre-1969), appended after the current ones.
_HISTORICAL_DEPTS: tuple[tuple[str, str], ...] = (
    ("Basses-Pyrénées", "Nouvelle-Aquitaine"),  # now Pyrénées-Atlantiques
    ("Basses Pyrénées", "Nouvelle-Aquitaine"),
    ("Seine", "Île-de-France"),                 # dissolved in 1968
    ("Seine-et-Oise", "Île-de-France"),         # dissolved in 1968
)

# Maps département name → région name.
# Keys keep their display casing; lookups go through the lower-cased indexes
# below (see _expand_place).
_DEPT_TO_REGION: dict[str, str] = {
    dept: region
    for region, depts in _DEPTS_BY_REGION
    for dept in depts
}
_DEPT_TO_REGION.update(_HISTORICAL_DEPTS)

# Case-insensitive lookup index (lower-cased name → région)
_DEPT_LOWER: dict[str, str] = {}
# Canonical name index (lower-cased name → name with its original casing)
_DEPT_CANONICAL: dict[str, str] = {}
for _dept_name, _region_name in _DEPT_TO_REGION.items():
    _DEPT_LOWER[_dept_name.lower()] = _region_name
    _DEPT_CANONICAL[_dept_name.lower()] = _dept_name
del _dept_name, _region_name
|
||
|
||
|
||
# Hour (optional minutes) followed by a place: "12 h 00 à City" / "15h au Mas"
_RE_TIME_WITH_CITY = re.compile(
    r"^(\d+)\s*h(?:\s*(\d+))?\s+(?:aux?|[àa])\s+(.+)$", re.I)
# Hour (optional minutes) and nothing else
_RE_TIME_ONLY = re.compile(r"^(\d+)\s*h(?:\s*(\d+))?\s*$", re.I)


def _split_place(raw: str) -> tuple[str, str]:
    """
    Separate a leading time of day from a raw place string.

    Returns a ``(time, place)`` pair:
      - ('HH:MM', 'City (Dept)')  for '12 h 00 à City (Dept)'
      - ('HH:MM', '')             for '2 h 30' (time only, no place)
      - ('', raw)                 when the string carries no time prefix
      - ('', '')                  for an empty input
    """
    if not raw:
        return "", ""

    # Try the richer pattern first; only it has a third (place) group.
    for pattern, has_city in ((_RE_TIME_WITH_CITY, True), (_RE_TIME_ONLY, False)):
        hit = pattern.match(raw)
        if hit is None:
            continue
        clock = "{:02d}:{:02d}".format(int(hit.group(1)), int(hit.group(2) or 0))
        remainder = hit.group(3).strip() if has_city else ""
        return clock, remainder

    return "", raw
|
||
|
||
|
||
def _expand_place(place: str) -> str:
    """
    Expand 'City (Département)' into a comma-separated place hierarchy.

    'Montpellier (Hérault)'        → 'Montpellier, Hérault, Occitanie, France'
    'Camarade (Ariège) - Machicot' → 'Machicot, Camarade, Ariège, Occitanie, France'

    A string without a parenthetical is returned unchanged. The région is
    inserted only when the département is found in _DEPT_LOWER; 'France' is
    always appended once a parenthetical is present.
    Strip any time prefix with _split_place() before calling this.
    """
    if not place:
        return ""

    parsed = re.search(r"^(.*?)\s*\(([^)]+)\)\s*(?:-\s*(.+))?$", place)
    if parsed is None:
        return place  # no parenthetical département — leave untouched

    dept_raw = parsed.group(2).strip()
    lookup = dept_raw.lower()
    region = _DEPT_LOWER.get(lookup, "")
    # Prefer the table's spelling; fall back to what the text says.
    dept_display = _DEPT_CANONICAL.get(lookup, dept_raw)

    subdivision = (parsed.group(3) or "").strip()
    hierarchy = [subdivision] if subdivision else []
    hierarchy += [parsed.group(1).strip(), dept_display]
    hierarchy += [region, "France"] if region else ["France"]
    return ", ".join(hierarchy)
|
||
|
||
|
||
# ── Marriage contract / source parsing ───────────────────────────────────────
|
||
|
||
# "<contrat de mariage …> - <dépôt> - <cote>" — three dash-separated fields
_RE_CONTRACT_SOURCE = re.compile(
    r"^(contrat\s+de\s+mariage\s+.+?)\s+-\s+(.+?)\s+-\s+(.+)$", re.I)
# Trailing "folio …" reference at the end of the cote field
_RE_FOLIO = re.compile(r"\b(folio\s+\S+(?:\s+\S+)*)\s*$", re.I)


def _parse_contract_source(text: str) -> dict:
    """
    Parse 'Contrat de Mariage chez Maître X - Dépôt - Cote folio N'.

    Returns a dict with keys "title", "depot", "caln" and "page", or an
    empty dict when the text does not have the three-field shape.
    "page" holds the trailing 'folio …' part of the cote (or '' if absent)
    and "caln" holds whatever precedes it.
    """
    found = _RE_CONTRACT_SOURCE.match(text.strip())
    if found is None:
        return {}

    title, depot, cote = (field.strip() for field in found.groups())

    folio = _RE_FOLIO.search(cote)
    page = folio.group(1) if folio else ""
    caln = cote[:folio.start()].strip() if folio else cote

    return {"title": title, "depot": depot, "caln": caln, "page": page}
|
||
|
||
|
||
# ── Spouse context parsing ─────────────────────────────────────────────────────
|
||
|
||
_RE_SPOUSE_CONTEXT = re.compile(
|
||
r"s[''`]unit\s+avec\s+(.+?)(?=\.\s+(?:Ce couple|Ils\s+se|Le couple)|\.?\s*$)",
|
||
re.I | re.S)
|
||
_RE_SPOUSE_LIFE = re.compile(
|
||
r"\((~?\d{4})\s*(?:-?>?\s*(~?\d{4}))?\)", re.I)
|
||
|
||
|
||
def _parse_spouse_context(full_text: str) -> dict:
|
||
"""
|
||
Extract name, birth, death, occupation from 's'unit avec ...' sentence.
|
||
Returns {"name", "birth", "death", "occu"}.
|
||
"""
|
||
m = _RE_SPOUSE_CONTEXT.search(full_text)
|
||
if not m:
|
||
return {}
|
||
ctx = m.group(1).strip()
|
||
|
||
# Name: up to first ( or ,
|
||
nm = re.match(r"([^(,]+)", ctx)
|
||
name = nm.group(1).strip() if nm else ctx.split(",")[0].strip()
|
||
|
||
# Dates from parenthetical (birth->death or ~birth)
|
||
birth = death = ""
|
||
dm = _RE_SPOUSE_LIFE.search(ctx)
|
||
if dm:
|
||
b_raw = dm.group(1)
|
||
b_year = b_raw.lstrip("~")
|
||
birth = f"ABT {b_year}" if b_raw.startswith("~") else b_year
|
||
if dm.group(2):
|
||
d_raw = dm.group(2)
|
||
d_year = d_raw.lstrip("~")
|
||
death = f"ABT {d_year}" if d_raw.startswith("~") else d_year
|
||
|
||
# Occupation: text after dates (or after name) before "le fils/la fille/les enfants"
|
||
occu = ""
|
||
after = ctx[dm.end():].strip() if dm else ctx[len(name):].strip()
|
||
after = after.lstrip(",").strip()
|
||
om = re.match(r"([^,(]+?)(?=\s*,\s*(?:le|la|les)\s+(?:fils|fille|enfant)|$)", after, re.I)
|
||
if om:
|
||
candidate = om.group(1).strip().rstrip(".")
|
||
# Only keep genuine occupations — reject parentage/family descriptions
|
||
if (candidate and len(candidate) < 60
|
||
and not re.search(r"\best\b|\bsont\b|\bfille\b|\bfils\b|\benfant\b|\bparents\b", candidate, re.I)):
|
||
occu = candidate
|
||
|
||
return {"name": name, "birth": birth, "death": death, "occu": occu}
|
||
|
||
|
||
# ── Gray paragraph grouping (witnesses, godparents, notes) ────────────────────
|
||
|
||
# Headers that introduce a group of witness lines
_RE_WITNESS_HDR = re.compile(
    r"^(Présents?|Témoins?|Déclarants?)\s*:?\s*$", re.I)
_RE_MARR_WITNESS_HDR = re.compile(
    r"^(Témoins?\s+au\s+mariage|Présents?\s+au\s+contrat|"
    r"Présents?\s+à\s+la\s+célébration|Présents?\s+au\s+mariage)\b", re.I)
_RE_DEAT_WITNESS_HDR = re.compile(
    r"^(Témoins?\s+au\s+décès|Présents?\s+au\s+décès)\b", re.I)
# Lines that are part of a witness list (leading dash or bullet)
_RE_WITNESS_ITEM = re.compile(r"^[\-–•]\s+\S", re.I)
# Single-line godparent references
_RE_GODPARENT = re.compile(r"^(Parrain|Marraine)\s*[:;]?\s+\S", re.I)
# Archive source references in gray paragraphs
_RE_GRAY_ARCHIVE = re.compile(
    r"^Archives\s+d[eé]\w*\s+.+?(?:\s+-\s+|\s*:\s*)Registre\b", re.I)
# Marriage-specific gray notes (contract, publications, dispensation, etc.)
_RE_GRAY_MARR_NOTE = re.compile(
    r"^(?:"
    r"(?:Date\s+du\s+|(?:Un|Il\s+(?:existe\s+un|a\s+été\s+fait\s+un))\s+)?"
    r"Contrat\s+de\s+[Mm]ariage\b"
    r"|Contrat\s+passé\s+(?:chez|devant|par)\b"
    r"|Accord\s+(?:chez|devant|par)\s+Ma[iî]tre\b"
    r"|Acte\s+respectueux\b"
    r"|L[''']acte\s+de\s+mariage\b"
    r"|Publications?\s+de\s+[Mm]ariage\b"
    r"|Dispense\s+(?:de|au|du)\b"
    r"|La\s+mariée?\s+dit\b"
    r"|Le\s+marié?\s+dit\b"
    r")", re.I)


def _group_gray_notes(all_paras: list[dict]) -> dict:
    """
    Walk the paragraphs in order and sort the gray ones into event buckets.

    Non-gray paragraphs are never stored; they only move the "current event"
    forward (marriage once a marriage sentence is seen, death once a death
    sentence is seen) and close any witness list in progress.

    Returns {
        "birth_notes":    list of strings (possibly multiline) for the BIRT NOTE
        "death_notes":    list of strings (possibly multiline) for the DEAT NOTE
        "marriage_notes": list of strings (possibly multiline) for the MARR NOTE
        "general_notes":  always present; nothing in this function fills it
    }
    A witness header plus its item lines is stored as one "\\n"-joined string.
    """
    buckets: dict[str, list] = {
        "birth_notes": [], "death_notes": [],
        "marriage_notes": [], "general_notes": [],
    }

    # Event context, advanced by the non-gray prose
    marriage_seen = False
    death_seen = False

    # Witness list being accumulated, and the bucket it will land in
    pending: list[str] = []
    pending_key: str = ""

    def context_key() -> str:
        """Bucket implied by the latest event met so far (death > marriage > birth)."""
        if death_seen:
            return "death_notes"
        return "marriage_notes" if marriage_seen else "birth_notes"

    def close_group() -> None:
        """Store the pending witness list (if any) and reset the accumulator."""
        nonlocal pending, pending_key
        if pending and pending_key:
            buckets[pending_key].append("\n".join(pending))
        pending = []
        pending_key = ""

    for para in all_paras:
        color = para["color"]
        text = para["text"].strip()
        if not text:
            continue

        # ── Non-gray prose: only updates the context ──
        if color != "gray":
            close_group()
            if re.search(r"\bse marient\b|contrat de mariage\s+le\b", text, re.I):
                marriage_seen = True
            if re.search(r"\best décédé\b|\bmeurt le\b", text, re.I):
                death_seen = True
            continue

        # ── Gray paragraph: first matching rule wins ──

        # 1a. Marriage-specific notes always go to the marriage bucket
        if _RE_GRAY_MARR_NOTE.match(text):
            close_group()
            if len(text) > 10:
                buckets["marriage_notes"].append(text)
            continue

        # 1b. Archive/registry references follow the current event
        if _RE_GRAY_ARCHIVE.match(text):
            close_group()
            if len(text) > 10:
                buckets[context_key()].append(text)
            continue

        # 2-4. Witness headers: explicit death, explicit marriage, then generic
        if _RE_DEAT_WITNESS_HDR.match(text):
            header_key = "death_notes"
        elif _RE_MARR_WITNESS_HDR.match(text):
            header_key = "marriage_notes"
        elif _RE_WITNESS_HDR.match(text):
            header_key = context_key()
        else:
            header_key = ""
        if header_key:
            close_group()
            pending_key = header_key
            pending = [text]
            continue

        # 5. Witness item: extend the open list, or open an implicit one
        if _RE_WITNESS_ITEM.match(text):
            if not pending_key:
                pending_key = context_key()
                pending = []
            pending.append(text)
            continue

        # 6. Godparent line is always a birth note
        if _RE_GODPARENT.match(text):
            close_group()
            if len(text) > 5:
                buckets["birth_notes"].append(text)
            continue

        # 7. Anything else follows the current event (very short text is dropped)
        close_group()
        if len(text) > 10:
            buckets[context_key()].append(text)

    close_group()
    return buckets
|
||
|
||
|
||
def _emit_note_block(lines: list[str], base_level: int) -> list[str]:
    """
    Emit a multiline note as a GEDCOM NOTE line followed by CONT lines.

    lines:      the note, one entry per physical line; the first becomes the
                NOTE record, each following one a CONT record one level deeper.
    base_level: level of the NOTE line (2 for event-level, 1 for INDI-level).

    Returns the list of values produced by gedcom_line(); an empty ``lines``
    yields an empty list instead of raising IndexError.
    Each line is cut to 248 characters before being handed to gedcom_line().
    NOTE(review): text beyond 248 characters is dropped, not continued with
    CONC — confirm this loss is acceptable.
    """
    if not lines:
        return []

    first, *rest = lines
    out = [gedcom_line(base_level, "NOTE", first[:248])]
    out.extend(
        gedcom_line(base_level + 1, "CONT", continuation[:248])
        for continuation in rest
    )
    return out
|
||
|
||
|
||
def _emit_grouped_notes(note_strings: list[str], base_level: int) -> list[str]:
    """
    Emit every note string as its own GEDCOM NOTE block.

    Each string is split on "\\n" and passed to _emit_note_block(); the
    resulting records are concatenated in input order.
    """
    return [
        record
        for note in note_strings
        for record in _emit_note_block(note.split("\n"), base_level)
    ]
|
||
|
||
|
||
# ── French calendar helpers ───────────────────────────────────────────────────
|
||
|
||
MOIS_FR = {
|
||
"janvier": "JAN", "février": "FEB", "fevrier": "FEB",
|
||
"mars": "MAR", "avril": "APR", "mai": "MAY", "juin": "JUN",
|
||
"juillet": "JUL", "août": "AUG", "aout": "AUG",
|
||
"septembre": "SEP", "octobre": "OCT", "novembre": "NOV", "décembre": "DEC",
|
||
"decembre": "DEC",
|
||
}
|
||
MOIS_PAT = "|".join(MOIS_FR.keys())
|
||
JOURS_PAT = "lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche"
|
||
|
||
|
||
def fr_date(day: str | None, month_fr: str | None, year: str | None,
|
||
prefix: str = "") -> str:
|
||
"""Convert French date parts to GEDCOM date string."""
|
||
parts = []
|
||
if prefix:
|
||
parts.append(prefix)
|
||
if day and day not in ("0", ""):
|
||
parts.append(str(int(day)))
|
||
if month_fr:
|
||
m = MOIS_FR.get(month_fr.lower().strip())
|
||
if m:
|
||
parts.append(m)
|
||
if year:
|
||
parts.append(year)
|
||
return " ".join(parts) if parts else ""
|
||
|
||
|
||
# ── Regex patterns ────────────────────────────────────────────────────────────
|
||
|
||
# Shared date fragment: optional weekday name, day number with an optional
# ordinal suffix (er/ème/eme/e), French month name, four-digit year.
# It contributes three capture groups: day, month, year.
_D = rf"(?:(?:{JOURS_PAT})\s+)?(\d+)(?:er|ème|eme|e)?\s+({MOIS_PAT})\s+(\d{{4}})"

# Birth
# Full birth date; groups 1-3 = day/month/year, group 4 = optional place after "à".
RE_BORN_FULL = re.compile(
    rf"(?:voit le jour|est n[eé]e?|naît)\s+le\s+{_D}"
    rf"(?:\s+[àa]\s+(.*?))?(?:\.|$)", re.I)
# Approximate birth ("vers"); group 1 = optional month, group 2 = year.
RE_BORN_APPROX = re.compile(
    rf"(?:voit le jour|est n[eé]e?|né[e]?)\s+vers\s+(?:({MOIS_PAT})\s+)?(\d{{4}})", re.I)
# Year-only birth introduced by "en" or "vers"; group 1 = year.
RE_BORN_YEAR = re.compile(
    rf"(?:né[e]?|voit le jour)\s+(?:en|vers)\s+(\d{{4}})", re.I)
# Baptism: group 1 = place, groups 2-4 = day/month/year.
RE_BAPTISM = re.compile(
    rf"(?:est baptisé[e]?)\s+[àa]\s+(.*?),\s+le\s+{_D}", re.I)
RE_BORN_INLINE = re.compile(r"né[e]?\s+vers\s+(\d{4})", re.I) # "née vers 1699"
RE_BORN_EN = re.compile(r"né[e]?\s+en\s+(\d{4})", re.I)

# Death
# Full death date; groups 1-3 = day/month/year, group 4 = optional text after "à".
# NOTE(review): the optional clause starts with ".*?", which can run past a
# period, so a match may extend into a later sentence.
RE_DEAD_FULL = re.compile(
    rf"est décédé[e]?\s+le\s+{_D}(?:.*?[àa]\s+([\w\s'\(\),\-]+?))?(?:\.|$)", re.I)
# "est décédé avant …": group 1 = free text up to the period/end.
RE_DEAD_BEF = re.compile(r"est décédé[e]?\s+avant\s+(.+?)(?:\.|$)", re.I)
# "est décédé après YYYY" / "est décédé en YYYY": group 1 = year.
RE_DEAD_AFT = re.compile(r"est décédé[e]?\s+après\s+(\d{4})", re.I)
RE_DEAD_YEAR = re.compile(r"est décédé[e]?\s+en\s+(\d{4})", re.I)
# RE_MEURT: only matches when NOT preceded by a relative clause in the same sentence
# (sentences starting with "Sa mère/son père/son époux meurt" are excluded in parse_death)
RE_MEURT = re.compile(
    rf"meurt\s+le\s+{_D}", re.I)
# Detects "sa/son <relative> … meurt" within a single sentence (no period in between).
_RE_RELATIVE_MEURT = re.compile(
    r"\b(?:sa|son)\s+(?:père|mère|époux|épouse|mari|femme|frère|sœur)\b[^.]*meurt", re.I)

# Marriage
# Marriage celebration; groups 1-3 = day/month/year, group 4 = optional place.
RE_MARR = re.compile(
    rf"(?:se marient|mariage (?:civil|religieux|est célébré))[^\d]*le\s+{_D}"
    rf"(?:\s+[àa]\s+(.*?))?(?:\.|$)", re.I)
# Marriage contract; same group layout as RE_MARR.
RE_CONTRAT = re.compile(
    rf"contrat de mariage\s+le\s+{_D}"
    rf"(?:\s+[àa]\s+(.*?))?(?:\.|$)", re.I)
# Spouse name: text after "Il/Elle s'unit avec" up to the first "," or "(".
RE_SPOUSE = re.compile(
    r"(?:Il|Elle)\s+s[''`]unit\s+avec\s+(.*?)(?:,|\()", re.I)
# Parenthesised life years, each exactly four characters drawn from digits and "~".
RE_SPOUSE_DATES = re.compile(r"\(([~\d]{4})-?([~\d]{4})?\)", re.I)

# Occupation
# "<word> sera <occupation>." — group 1 = text up to the next period.
RE_OCCU_SERA = re.compile(r"\w+\s+sera\s+([^.]+)\.", re.I)
# "<word> est <occupation>" — group 1 = letters, hyphens and spaces before "." or ",".
RE_OCCU_EST = re.compile(r"\w+\s+est\s+([a-zéàèù][a-zéàèù\-\s]+?)[\.,]", re.I)

# Person ID header — matches standalone IDs like "1", "1a", "1.2", "1.2b", "1.4.4b.1"
RE_PERSON_ID = re.compile(r"^(\d+[a-z]?(?:\.\d+[a-z]?)*)\s*$", re.I)

# Name line (bold): "Pierre FABRE voit le jour..."
# Group 1 = given name(s) plus surname, i.e. everything before the birth verb.
RE_NAME_LINE = re.compile(
    r"^([A-ZÀ-Ü][a-zà-ü\-]+(?:\s+[A-ZÀ-Ü][a-zà-ü\-]+)*" # first name(s)
    r"\s+[A-ZÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ][A-ZÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ\s'\-]+?)" # SURNAME
    r"\s+(?:voit le jour|est né[e]?|naît|né[e]?\s)", re.I
)
|
||
|
||
# Sex from prose: "Il/Elle est le fils …", "… la fille …", "fils légitime", "fille naturelle"
RE_FILS = re.compile(r"\b(Il|Elle)\s+est\s+(?:l[e']|le)\s*fils\b", re.I)
RE_FILLE = re.compile(r"\b(Il|Elle)\s+est\s+(?:l[ae']|la)\s*fille\b", re.I)
RE_FILS2 = re.compile(r"\bfils\s+(?:légitim|naturel)", re.I)
RE_FILLE2 = re.compile(r"\bfille\s+(?:légitim|naturell)", re.I)


def parse_sex(full_text: str) -> str:
    """
    Guess a person's sex from prose clues.

    Kinship wording is checked first ("fils" before "fille"); if none is
    found, a case-sensitive "Il est" / "Elle est" pronoun decides.
    Returns 'M', 'F', or '' when nothing matches.
    """
    kinship_clues = (
        (RE_FILS, "M"), (RE_FILS2, "M"),
        (RE_FILLE, "F"), (RE_FILLE2, "F"),
    )
    for pattern, sex in kinship_clues:
        if pattern.search(full_text):
            return sex

    # Pronoun fallback
    for pronoun, sex in ((r"\bIl\s+est\b", "M"), (r"\bElle\s+est\b", "F")):
        if re.search(pronoun, full_text):
            return sex

    return ""
|
||
|
||
|
||
def parse_birth(full_text: str) -> dict:
    """
    Extract the birth (or baptism) date and place from a person's text.

    Patterns are tried from most to least precise; the first hit wins:
    full birth date, baptism, approximate birth ("vers"), then year only.
    Returns {"date", "plac", "type"}, where "type" is "BAPM" for a baptism
    and "BIRT" otherwise; "date" and "plac" are '' when nothing is found.
    """
    def record(date: str = "", plac: str = "", kind: str = "BIRT") -> dict:
        return {"date": date, "plac": plac, "type": kind}

    full = RE_BORN_FULL.search(full_text)
    if full:
        day, month, year, place = full.groups()
        return record(fr_date(day, month, year), _clean_place(place or ""))

    baptism = RE_BAPTISM.search(full_text)
    if baptism:
        place, day, month, year = baptism.groups()
        return record(fr_date(day, month, year), _clean_place(place or ""), "BAPM")

    approx = RE_BORN_APPROX.search(full_text)
    if approx:
        return record(fr_date(None, approx.group(1), approx.group(2), "ABT"))

    # NOTE(review): a year introduced by "en" is also emitted as "ABT <year>",
    # whereas parse_death emits the bare year for "en" — confirm this is intended.
    for year_pattern in (RE_BORN_YEAR, RE_BORN_EN, RE_BORN_INLINE):
        hit = year_pattern.search(full_text)
        if hit:
            return record("ABT " + hit.group(1))

    return record()
|
||
|
||
|
||
# ", à l'âge de …" clause (up to the next comma) — not a place, must be skipped
_AGE_CLAUSE = re.compile(r",\s*[àa]\s+l['']\âge\s+de\s+[^,]+", re.I)
# ", à Place" (optionally followed by a parenthetical) closing the sentence
_DEAD_PLACE = re.compile(r",\s*[àa]\s+([A-ZÀ-Ü][^,.]+?(?:\([A-Za-zÀ-Ü\s\-]+\))?)\s*(?:\.|$)", re.I)


def _extract_death_place(sentence: str) -> str:
    """
    Return the place named in a death sentence, or '' if there is none.

    The "à l'âge de …" clause is removed first so that it cannot be taken
    for a place; of the remaining ", à …" candidates the last one is kept
    and normalised with _clean_place().
    """
    without_age = _AGE_CLAUSE.sub("", sentence)

    last_hit = None
    for last_hit in _DEAD_PLACE.finditer(without_age):
        pass  # walk to the final candidate

    if last_hit is None:
        return ""
    return _clean_place(last_hit.group(1))
|
||
|
||
|
||
def parse_death(full_text: str) -> dict:
    """
    Extract the death date and place from a person's text.

    Patterns are tried in order — full date ("est décédé le …"), "meurt le …"
    (ignored when the sentence is about a relative), "avant …", "après YYYY",
    "en YYYY" — and the first usable one wins.
    Returns {"date", "plac"}; both are '' when nothing is found. "plac" is
    only filled for the full-date form.
    """
    result = {"date": "", "plac": ""}

    def sentence_end(pos: int) -> int:
        """Index just past the first '.' at or after ``pos`` (end of text if none)."""
        stop = full_text.find(".", pos)
        return len(full_text) if stop == -1 else stop + 1

    m = RE_DEAD_FULL.search(full_text)
    if m:
        result["date"] = fr_date(m.group(1), m.group(2), m.group(3))
        # The place is looked up in the death sentence only. The sentence runs
        # from the match start to the first period after the year: the match
        # end cannot be used because the pattern consumes the closing period
        # and its optional clause may reach into later sentences.
        sentence = full_text[m.start():sentence_end(m.end(3))]
        result["plac"] = _extract_death_place(sentence)
        return result

    # RE_MEURT: only when the sentence is about the main person, not a relative
    m = RE_MEURT.search(full_text)
    if m:
        # Sentence containing the match: after the previous period (or from
        # the start of the text) up to the next period (or the end of the text).
        sent_start = full_text.rfind(".", 0, m.start()) + 1
        sentence = full_text[sent_start:sentence_end(m.end())]
        if not _RE_RELATIVE_MEURT.search(sentence):
            result["date"] = fr_date(m.group(1), m.group(2), m.group(3))
            return result

    m = RE_DEAD_BEF.search(full_text)
    if m:
        # NOTE(review): the text after "avant" is copied verbatim (cut at the
        # first comma), so a French date phrase ends up in the GEDCOM value.
        raw = m.group(1).strip().split(",")[0].rstrip(".")
        result["date"] = "BEF " + raw
        return result

    m = RE_DEAD_AFT.search(full_text)
    if m:
        result["date"] = "AFT " + m.group(1)
        return result

    m = RE_DEAD_YEAR.search(full_text)
    if m:
        result["date"] = m.group(1)
        return result

    return result
|
||
|
||
|
||
# Explicit "no children" wording: "pas d'enfant(s)" or "il n'y a pas …".
# The apostrophe and the "y" are matched as separate characters so that the
# usual spelling "il n'y a pas" is recognised.
_RE_NO_CHILDREN = re.compile(
    r"pas\s+d(?:['’]|e\s*)enfants?|il\s+n['’]?\s*y?\s+a\s+pas", re.I)
# Wording that announces children: "aura 3 enfants", "ont eu deux enfants", "ce couple aura …"
_RE_HAS_CHILDREN = re.compile(
    r"(?:aura|a\s+eu|avez?|ont)\s+\w+\s+enfants?|(?:ce\s+couple|ils)\s+aura", re.I)


def _parse_one_marriage(segment: str) -> dict:
    """
    Parse spouse + date + place from a single 's'unit avec …' segment.

    Returns a dict with keys "date", "plac", "spouse", "spouse_birth",
    "spouse_death", "spouse_occu", "source" (always {} here; filled by the
    caller) and "has_children_text" (True when the segment mentions children
    and does not also say there are none).
    """
    result = {"date": "", "plac": "", "spouse": "",
              "spouse_birth": "", "spouse_death": "", "spouse_occu": "",
              "source": {},
              "has_children_text": False}

    # Spouse: prefer the detailed sentence parser, fall back to the bare name regex
    spouse_info = _parse_spouse_context(segment)
    if spouse_info.get("name"):
        result["spouse"] = spouse_info["name"]
        result["spouse_birth"] = spouse_info.get("birth", "")
        result["spouse_death"] = spouse_info.get("death", "")
        result["spouse_occu"] = spouse_info.get("occu", "")
    else:
        m = RE_SPOUSE.search(segment)
        if m:
            # Drop any parenthetical left inside the captured name
            result["spouse"] = re.sub(r"\s*\(.*?\)", "", m.group(1).strip()).strip()

    # Date and place: the celebration wins over the contract
    for pat in (RE_MARR, RE_CONTRAT):
        m = pat.search(segment)
        if m:
            result["date"] = fr_date(m.group(1), m.group(2), m.group(3))
            result["plac"] = _clean_place(m.group(4) or "")
            break

    # Detect inline children mention in this segment
    result["has_children_text"] = (
        bool(_RE_HAS_CHILDREN.search(segment))
        and not _RE_NO_CHILDREN.search(segment)
    )
    return result
|
||
|
||
|
||
def parse_marriages(full_text: str, italic_texts: list[str] | None = None) -> list[dict]:
|
||
"""
|
||
Return list of marriage dicts, one per union found in full_text.
|
||
Each dict: {spouse, spouse_birth, spouse_death, spouse_occu,
|
||
date, plac, source, has_children_text}.
|
||
"""
|
||
splits = [m.start() for m in re.finditer(r"\bs[''`]unit\s+avec\b", full_text, re.I)]
|
||
if not splits:
|
||
return []
|
||
|
||
marriages = []
|
||
for i, start in enumerate(splits):
|
||
end = splits[i + 1] if i + 1 < len(splits) else len(full_text)
|
||
seg = full_text[start:end]
|
||
marriages.append(_parse_one_marriage(seg))
|
||
|
||
# Assign contract sources from italic paragraphs to the best-matching marriage
|
||
for it in (italic_texts or []):
|
||
src = _parse_contract_source(it)
|
||
if not src:
|
||
continue
|
||
# Prefer marriage with a date; fall back to last
|
||
target = next((m for m in reversed(marriages) if m["date"]), marriages[-1])
|
||
if not target["source"]:
|
||
target["source"] = src
|
||
|
||
return marriages
|
||
|
||
|
||
def parse_occupation(full_text: str) -> str:
    """
    Return the occupation announced by a "<name> sera <occupation>." sentence.

    Only the first such sentence is used; the result is '' when none is found.
    """
    found = RE_OCCU_SERA.search(full_text)
    return found.group(1).strip().rstrip(".") if found else ""
|
||
|
||
|
||
def _clean_place(raw: str) -> str:
    """
    Normalise a place string extracted from HTML text.

    Steps: trim, drop trailing punctuation (. , ; and a dangling "("),
    collapse runs of whitespace, cut at the first word that starts a new
    sentence (Il, Elle, Ce, Le, La, Son, …), strip the punctuation the cut
    may have exposed, and cap the result at 80 characters.
    Returns '' for an empty input.
    """
    if not raw:
        return ""
    # Strip trailing punctuation (a closing paren is kept: "City (Dept)")
    p = raw.strip().rstrip(".,;(")
    p = re.sub(r"\s+", " ", p).strip()
    # Trim at known sentence-starting words (case-sensitive on purpose)
    p = re.split(r"\s+(?:Il|Elle|Ce|Ils|Leur|Le|La|Les|Un|Une|Son|Sa)\b", p, maxsplit=1)[0]
    # The cut can leave the previous sentence's punctuation behind
    # ("Nîmes (Gard). Il …" → "Nîmes (Gard)."), so strip once more.
    p = p.rstrip(" .,;(")
    # 80-character cap kept from the original code; never end on a blank
    return p[:80].rstrip()
|
||
|
||
|
||
# ── HTML / paragraph parsing ──────────────────────────────────────────────────
|
||
|
||
def extract_paragraphs(html_body: str) -> list[dict]:
    """
    Parse an HTML body into a list of paragraph dicts:
        {text, color, is_bold, bold_text, is_italic}

    text:      visible text with whitespace collapsed (empty blocks are skipped)
    color:     first explicit colour found among the block's descendants,
               "black" when there is none. Known values are normalised to
               black / navy / red / gray; anything else is returned
               lower-cased as written.
    is_bold:   True when the block contains non-empty <b>/<strong> text
    bold_text: that bold text, whitespace collapsed
    is_italic: True when the block contains an <i>/<em> tag and no bold text
    """
    soup = BeautifulSoup(html_body, "html.parser")

    # Spellings that are folded onto the four colour names used downstream.
    named = {"#000000": "black", "#000080": "navy", "navy": "navy",
             "red": "red", "gray": "gray", "grey": "gray"}

    def tag_color(tag) -> str:
        """Return the normalised text colour declared on ``tag``, or ''."""
        style = tag.get("style", "")
        # The look-behind keeps "background-color:" (or any "*-color:") from
        # being read as the text colour.
        m = re.search(r"(?<![\w-])color:\s*(\w+)", style)
        if m:
            value = m.group(1).lower()
            # Same normalisation as the color= attribute ("grey" → "gray")
            return named.get(value, value)
        color_attr = tag.get("color", "")
        if color_attr:
            return named.get(color_attr.lower(), color_attr.lower())
        return ""

    # Collect all block elements in document order:
    #   - all <p> tags
    #   - leaf <div> tags (no nested div children) — some families use divs instead of p
    # NOTE(review): a <div> that wraps <p> tags is kept together with its <p>
    # children, so its text appears twice — confirm against the real pages.
    block_tags = [
        tag for tag in soup.find_all(["p", "div"])
        if tag.name == "p" or not tag.find("div")
    ]

    paragraphs = []
    for p in block_tags:
        # Determine dominant color (first explicit color found).
        # Only descendants are inspected, not the block tag itself.
        color = "black"
        for tag in p.descendants:
            if hasattr(tag, "get"):  # skips plain text nodes
                c = tag_color(tag)
                if c:
                    color = c
                    break

        # Bold detection — <b> or <strong>
        bold_spans = p.find_all(["b", "strong"])
        bold_text = " ".join(b.get_text(" ", strip=True) for b in bold_spans).strip()
        is_bold = bool(bold_text)

        # Full text
        full_text = p.get_text(" ", strip=True).replace("\xa0", " ").strip()
        full_text = re.sub(r"\s+", " ", full_text)

        # Italic detection — any <i>/<em> inside a block that has no bold text
        italic_spans = p.find_all(["i", "em"])
        is_italic = bool(italic_spans) and not is_bold

        if full_text:
            paragraphs.append({
                "text": full_text,
                "color": color,
                "is_bold": is_bold,
                "bold_text": re.sub(r"\s+", " ", bold_text),
                "is_italic": is_italic,
            })

    return paragraphs
|
||
|
||
|
||
def split_into_person_blocks(paragraphs: list[dict]) -> list[dict]:
    """
    Group paragraphs into person blocks, using the bold ID lines as separators.

    Returns a list of {id, name_line, paras}:
      - a bold, black paragraph whose whole text is a person ID opens a new
        block (the ID line itself is not stored in "paras");
      - the first bold paragraph after it becomes the block's name line;
      - every later paragraph is appended to the open block.

    Pages without an explicit ID line (generation 1): a leading bold
    paragraph that looks like a name + birth line opens an implicit block
    with id "0". Paragraphs met before any block is open are dropped.
    """
    blocks = []
    current = None

    for para in paragraphs:
        text = para["text"]
        bold_text = para["bold_text"]
        bold = para["is_bold"]

        # ── Standalone person-ID line: bold, black, and nothing but the ID ──
        if bold and para["color"] in ("black", ""):
            candidate = re.sub(r"[\s\xa0]+", "", bold_text)
            whole = re.sub(r"[\s\xa0]+", "", text)
            if RE_PERSON_ID.match(candidate) and RE_PERSON_ID.match(whole):
                if current:
                    blocks.append(current)
                current = {"id": candidate, "name_line": "", "paras": []}
                continue

        # ── Plain paragraph: belongs to the open block, if there is one ──
        if not bold:
            if current is not None:
                current["paras"].append(para)
            continue

        # ── Bold paragraph before any block: only a name+birth line opens
        #    the implicit root block (id "0"); anything else is a title ──
        if current is None:
            if RE_NAME_LINE.match(bold_text) or RE_NAME_LINE.match(text):
                current = {"id": "0", "name_line": text, "paras": [para]}
            continue

        # ── Bold paragraph inside a block: the first one is the name line ──
        if not current["name_line"]:
            current["name_line"] = text
        current["paras"].append(para)

    if current:
        blocks.append(current)

    return blocks
|
||
|
||
|
||
def parse_block(block: dict, family_name: str) -> dict:
    """
    Convert a person block into a structured person dict.

    Args:
        block: {"id", "name_line", "paras"} as produced by
            split_into_person_blocks.
        family_name: stored unchanged under the "family" key.

    Returns:
        Dict with keys id, family, name, given, surname, sex, birth, death,
        marriages, occupation, birth_notes, death_notes, general_notes,
        children_inline and full_text.
    """
    person_id = block["id"]
    name_line = block["name_line"]
    all_paras = block["paras"]
    full_text = " ".join(p["text"] for p in all_paras)

    # ── Name ──
    # Prefer the bold run of the first paragraph; fall back to cutting the
    # name line at the first birth keyword.
    bold0 = (all_paras[0]["bold_text"] if all_paras else "").strip()
    if bold0 and not RE_PERSON_ID.match(re.sub(r"[\s\xa0]+", "", bold0)):
        name = bold0
    elif name_line:
        name = re.split(
            r"\s+(?:voit le jour|est né[e]?|naît|né[e]?\s)", name_line, maxsplit=1
        )[0].strip()
    else:
        name = ""

    given, surname = _split_name(name)

    # ── Sex ──
    # Only the intro sentence is inspected: text after "s'unit avec"
    # describes the spouse and children.  The apostrophe may be straight,
    # typographic (U+2019) or a backtick.
    intro_end = re.search(r"\bs['\u2019`]unit\s+avec\b", full_text)
    sex_context = full_text[: intro_end.start()] if intro_end else full_text[:500]
    sex = parse_sex(sex_context)

    # Secondary hint: grammatical gender of "est né(e)" near the start.
    if not sex:
        if re.search(r"\best\s+née\b", full_text[:300], re.I):
            sex = "F"
        elif re.search(r"\best\s+né\b(?!e)", full_text[:300], re.I):
            sex = "M"

    # Last resort: first paragraph coloured navy (→ M) or red (→ F).
    if not sex:
        color_sex = {"navy": "M", "red": "F"}
        sex = next(
            (color_sex[p["color"]] for p in all_paras if p["color"] in color_sex),
            sex,
        )

    # ── Vital events, marriages, occupation ──
    birth = parse_birth(full_text)
    death = parse_death(full_text)
    italic_texts = [p["text"] for p in all_paras if p.get("is_italic")]
    marriages = parse_marriages(full_text, italic_texts)
    occu = parse_occupation(full_text)

    # ── Gray paragraph groups (witnesses, godparents, general notes) ──
    grouped = _group_gray_notes(all_paras)

    # Marriage notes go to the first marriage; when no marriage was parsed a
    # minimal placeholder entry is created to carry them.
    if grouped["marriage_notes"]:
        if marriages:
            marriages[0].setdefault("notes", []).extend(grouped["marriage_notes"])
        else:
            marriages = [{
                "date": "", "plac": "", "spouse": "",
                "spouse_birth": "", "spouse_death": "", "spouse_occu": "",
                "source": {}, "has_children_text": False,
                "notes": list(grouped["marriage_notes"]),
            }]

    # ── Children listed inline ──
    children_inline = _extract_children_inline(full_text)

    return {
        "id": person_id,
        "family": family_name,
        "name": name,
        "given": given,
        "surname": surname,
        "sex": sex,
        "birth": birth,
        "death": death,
        "marriages": marriages,
        "occupation": occu,
        "birth_notes": grouped["birth_notes"],
        "death_notes": grouped["death_notes"],
        "general_notes": grouped["general_notes"],
        "children_inline": children_inline,
        "full_text": full_text,
    }
|
||
|
||
|
||
def _split_name(name: str) -> tuple[str, str]:
    """
    Split 'Pierre FABRE' → ('Pierre', 'FABRE').

    The surname is the longest run of trailing tokens that are either
    entirely upper-case or a lower-case particle (de, du, des, la, le, d',
    l').  An elided particle glued to an upper-case surname ("d'ARC") also
    counts, with a straight or typographic apostrophe.  Everything before
    that run is the given name.  Either part may be empty.
    """
    particles = ("de", "d'", "du", "des", "l'", "la", "le", "d\u2019", "l\u2019")
    elided = ("d'", "l'", "d\u2019", "l\u2019")

    def is_surname_token(tok: str) -> bool:
        if tok.upper() == tok or tok in particles:
            return True
        # "d'ARC": lower-case elided particle + upper-case remainder.
        head, rest = tok[:2], tok[2:]
        return (
            head in elided
            and rest.upper() == rest
            and any(ch.isalpha() for ch in rest)
        )

    tokens = name.split()
    cut = len(tokens)
    while cut > 0 and is_surname_token(tokens[cut - 1]):
        cut -= 1
    return " ".join(tokens[:cut]), " ".join(tokens[cut:])
|
||
|
||
|
||
def _extract_children_inline(text: str) -> list[str]:
    """
    Collect children mentioned inline as '- Name né(e) …'.

    A hit is a dash, whitespace, then a capitalised run of letters and
    spaces, followed by whitespace and "né".  Returns the captured names,
    stripped, in order of appearance.
    """
    pattern = r"-\s+([A-ZÀ-Ü][a-zà-üA-ZÀ-Ü\s]+?)\s+né"
    return [hit.strip() for hit in re.findall(pattern, text)]
|
||
|
||
|
||
# ── SQL extraction ────────────────────────────────────────────────────────────
|
||
|
||
def parse_sql_values(line: str) -> list:
    """
    Parse one SQL row tuple such as ``(1, 'text', NULL),`` into a list.

    A leading "(" and a trailing ");", ")," or ")" are removed first.
    Quoted strings are unescaped: backslash sequences are decoded and a
    doubled quote ('') inside a string is read as one literal quote.  NULL
    becomes None; any other unquoted token is returned as a stripped
    string.

    Returns:
        The row's values in column order.
    """
    s = line.strip()
    if s.startswith("("):
        s = s[1:]
    for sfx in (");", "),", ")"):
        if s.endswith(sfx):
            s = s[: -len(sfx)]
            break

    # Backslash escapes; unknown sequences decode to the escaped character.
    escapes = {
        "n": "\n", "r": "\r", "t": "\t", "b": "\x08",
        "\\": "\\", "'": "'", '"': '"', "0": "\x00", "Z": "\x1a",
    }

    values = []
    i, n = 0, len(s)
    while i < n:
        while i < n and s[i] in " \t":
            i += 1
        if i >= n:
            break

        if s[i:i + 4] == "NULL":
            values.append(None)
            i += 4
        elif s[i] == "'":
            i += 1
            buf = []
            while i < n:
                c = s[i]
                if c == "\\" and i + 1 < n:
                    buf.append(escapes.get(s[i + 1], s[i + 1]))
                    i += 2
                elif c == "'":
                    if i + 1 < n and s[i + 1] == "'":
                        # '' inside a string is an escaped quote, not the end.
                        buf.append("'")
                        i += 2
                    else:
                        i += 1
                        break
                else:
                    buf.append(c)
                    i += 1
            values.append("".join(buf))
        else:
            # Unquoted token (number, keyword): runs up to the next comma.
            j = s.find(",", i)
            if j == -1:
                j = n
            values.append(s[i:j].strip())
            i = j

        # Skip the separator(s) before the next value.
        while i < n and s[i] in " \t,":
            i += 1
    return values
|
||
|
||
|
||
def stream_filiation_nodes(sql_file: Path) -> dict[int, dict]:
    """
    Stream the SQL dump once and collect the 'filiation' book pages.

    Titles come from ``drupal_node`` rows of type 'book' whose title
    contains "filiation" (case-insensitive); bodies come from
    ``drupal_node_revisions`` rows with a matching nid.  The parser expects
    an ``INSERT INTO `table` (cols) VALUES`` header line followed by one
    row tuple per line, and only attaches bodies to nodes already seen, so
    ``drupal_node`` must precede ``drupal_node_revisions`` in the file.

    When a node has several revision rows, the one whose ``vid`` equals the
    node row's ``vid`` wins; if no such row exists the last revision read
    is kept.
    NOTE(review): this assumes node.vid identifies the current revision
    (Drupal 6 schema) — confirm against the dump.

    Returns:
        {nid: {"title": str, "body": str}}
    """
    node_cols: list[str] = []
    rev_cols: list[str] = []
    nodes: dict[int, dict] = {}
    node_vid: dict[int, object] = {}   # nid → vid recorded on the node row
    pinned: set[int] = set()           # nids whose matching revision was found

    current_table = None
    targets = {"drupal_node", "drupal_node_revisions"}
    insert_re = re.compile(r"INSERT INTO `([^`]+)` \((.+)\) VALUES", re.I)

    def column_names(raw: str) -> list[str]:
        return [c.strip().strip("`") for c in raw.split(",")]

    def as_nid(value) -> int | None:
        # Missing column (None) or a non-numeric token yields None.
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    with open(sql_file, encoding="utf-8", errors="replace") as fh:
        for line in fh:
            ls = line.rstrip("\r\n")

            m = insert_re.match(ls)
            if m:
                tname = m.group(1)
                current_table = tname if tname in targets else None
                if tname == "drupal_node":
                    node_cols = column_names(m.group(2))
                elif tname == "drupal_node_revisions":
                    rev_cols = column_names(m.group(2))
                continue

            if current_table is None:
                continue

            stripped = ls.strip()
            if not stripped.startswith("("):
                if stripped.endswith(";"):
                    current_table = None
                continue

            row = parse_sql_values(stripped)

            if current_table == "drupal_node" and node_cols:
                d = dict(zip(node_cols, row))
                title = d.get("title") or ""      # NULL title → ""
                if d.get("type") == "book" and "filiation" in title.lower():
                    nid = as_nid(d.get("nid"))
                    if nid is not None:
                        nodes[nid] = {"title": title, "body": ""}
                        node_vid[nid] = d.get("vid")
                        pinned.discard(nid)

            elif current_table == "drupal_node_revisions" and rev_cols:
                d = dict(zip(rev_cols, row))
                nid = as_nid(d.get("nid"))
                if nid in nodes:
                    rev_vid = d.get("vid")
                    is_current = rev_vid is not None and rev_vid == node_vid.get(nid)
                    if is_current or nid not in pinned:
                        nodes[nid]["body"] = d.get("body") or ""
                    if is_current:
                        pinned.add(nid)

            # A row ending in ";" closes the INSERT statement.
            if stripped.endswith(";"):
                current_table = None

    return nodes
|
||
|
||
|
||
# ── Family grouping ───────────────────────────────────────────────────────────
|
||
|
||
def family_name_from_title(title: str) -> str:
    """
    Extract the family name from a page title.

    'Les filiations FABRE : Génération 3' → 'FABRE'

    The "Les filiations" prefix and everything from the first ":" are
    removed, then one leading particle is stripped: d' / l' (straight or
    typographic apostrophe) or de / du / des followed by whitespace.  A
    surname that merely starts with those letters (DELMAS, DESCAMPS) is
    left intact.
    """
    t = title.replace("Les filiations", "").strip()
    # Drop the suffix starting at ':'
    t = t.split(":")[0].strip()
    # Word particles need trailing whitespace so they cannot eat the start
    # of a surname; elided particles need the apostrophe.
    t = re.sub(
        r"^(?:d['\u2019]\s*|de\s+|du\s+|des\s+|l['\u2019]\s*)",
        "",
        t,
        flags=re.I,
    )
    return t.strip()
|
||
|
||
|
||
def generation_number(title: str) -> int:
    """Return the number following 'Génération' in *title*, or 0 if absent."""
    found = re.search(r"[Gg]én[eé]ration\s+(\d+)", title)
    if found is None:
        return 0
    return int(found.group(1))
|
||
|
||
|
||
def group_by_family(nodes: dict[int, dict]) -> dict[str, list[dict]]:
    """
    Bucket pages by family name.

    Pages whose title contains 'présentation' (case-insensitive) or yields
    an empty family name are dropped.  Each remaining page is stored as
    {nid, title, body, gen}.  Within a family, pages are ordered by
    ascending generation, with gen == 0 pages placed last.

    Returns {family_name: [page, ...]}.
    """
    grouped: dict[str, list] = {}
    for node_id, meta in nodes.items():
        heading = meta["title"]
        family = family_name_from_title(heading)
        generation = generation_number(heading)
        # Intro pages and titles without a family name are not exported.
        if "présentation" in heading.lower() or not family:
            continue
        page = {"nid": node_id, "title": heading, "body": meta["body"], "gen": generation}
        grouped.setdefault(family, []).append(page)

    for family_pages in grouped.values():
        family_pages.sort(key=lambda page: (page["gen"] == 0, page["gen"]))

    return grouped
|
||
|
||
|
||
# ── Cross-page person assembly ────────────────────────────────────────────────
|
||
|
||
def parent_id(person_id: str) -> str | None:
|
||
"""
|
||
Given a person ID like '1.4.4b.1', return the parent's ID '1.4.4b'.
|
||
Returns None for root.
|
||
"""
|
||
parts = person_id.rsplit(".", 1)
|
||
if len(parts) == 1:
|
||
return None
|
||
return parts[0] if parts[0] else None
|
||
|
||
|
||
def child_union_letter(person_id: str) -> str:
    """
    Return the lower-cased letter suffix of the last ID component.

    '5.1.7.1a.5b.3a.7.1a' → 'a'   (child 1 of union "a" of the parent)
    '5.1.7.1a.5b.3a.7.2'  → ''    (no explicit union letter)

    A last component that is not digits followed by optional letters also
    yields ''.
    """
    tail = person_id.rpartition(".")[2]
    hit = re.match(r"^\d+([a-z]*)$", tail, re.I)
    if hit is None:
        return ""
    return hit.group(1).lower()
|
||
|
||
|
||
# ── GEDCOM generation ─────────────────────────────────────────────────────────
|
||
|
||
# Running serial numbers behind the @I…@ / @F…@ / @S…@ / @R…@ xrefs.
_indi_counter = _fam_counter = _sour_counter = _repo_counter = 0

# Source / repository bookkeeping; rebound to fresh dicts for every file.
_sour_registry: dict[tuple, str] = {}   # (title_lc, depot_lc, caln_lc) → SOUR xref
_repo_registry: dict[str, str] = {}     # depot_lc → REPO xref
_sour_records: dict[str, dict] = {}     # SOUR xref → {title, repo_xref, caln}
_repo_records: dict[str, str] = {}      # REPO xref → repository name
|
||
|
||
|
||
def new_indi() -> str:
    """Advance the individual counter and return its xref, e.g. '@I0001@'."""
    global _indi_counter
    serial = _indi_counter + 1
    _indi_counter = serial
    return f"@I{serial:04d}@"
|
||
|
||
|
||
def new_fam() -> str:
    """Advance the family counter and return its xref, e.g. '@F0001@'."""
    global _fam_counter
    serial = _fam_counter + 1
    _fam_counter = serial
    return f"@F{serial:04d}@"
|
||
|
||
|
||
def new_sour() -> str:
    """Advance the source counter and return its xref, e.g. '@S0001@'."""
    global _sour_counter
    serial = _sour_counter + 1
    _sour_counter = serial
    return f"@S{serial:04d}@"
|
||
|
||
|
||
def new_repo() -> str:
    """Advance the repository counter and return its xref, e.g. '@R0001@'."""
    global _repo_counter
    serial = _repo_counter + 1
    _repo_counter = serial
    return f"@R{serial:04d}@"
|
||
|
||
|
||
def _get_or_create_repo(depot: str) -> str:
    """
    Return the REPO xref for *depot*, registering it on first use.

    Lookup is by the stripped, lower-cased name; the stripped name in its
    original case is what gets stored for output.
    """
    label = depot.strip()
    lookup = label.lower()
    try:
        return _repo_registry[lookup]
    except KeyError:
        pass
    repo_ref = new_repo()
    _repo_registry[lookup] = repo_ref
    _repo_records[repo_ref] = label
    return repo_ref
|
||
|
||
|
||
def _get_or_create_sour(title: str, depot: str, caln: str) -> str:
    """
    Return the SOUR xref for a (title, depot, call number) triple.

    The triple is compared stripped and case-insensitively, so the same
    source cited twice shares one record.  On first use a record
    {title, repo_xref, caln} is stored; repo_xref is "" when *depot* is
    empty or whitespace-only, so no nameless repository is created.
    """
    clean_title = title.strip()
    clean_depot = depot.strip()
    clean_caln = caln.strip()

    key = (clean_title.lower(), clean_depot.lower(), clean_caln.lower())
    if key in _sour_registry:
        return _sour_registry[key]

    rx = new_sour()
    _sour_registry[key] = rx
    repo_xref = _get_or_create_repo(clean_depot) if clean_depot else ""
    _sour_records[rx] = {
        "title": clean_title,
        "repo_xref": repo_xref,
        "caln": clean_caln,
    }
    return rx
|
||
|
||
|
||
def gedcom_line(level: int, tag: str, value: str = "") -> str:
    """Format one GEDCOM line as 'LEVEL TAG [VALUE]'; an empty value is omitted."""
    return f"{level} {tag} {value}" if value else f"{level} {tag}"
|
||
|
||
|
||
def _event_date_place_lines(event: dict, level: int = 2) -> list[str]:
    """Emit the DATE (+ TIME) and PLAC lines of one event at *level*."""
    out: list[str] = []
    # _split_place is used as returning a (time, remaining place) pair.
    clock, raw_place = _split_place(event.get("plac", ""))
    if event.get("date"):
        out.append(gedcom_line(level, "DATE", event["date"]))
        if clock:
            out.append(gedcom_line(level + 1, "TIME", clock))
    elif clock:
        # No date to hang the time on: keep it at the event's detail level.
        out.append(gedcom_line(level, "TIME", clock))
    place = _expand_place(raw_place)
    if place:
        out.append(gedcom_line(level, "PLAC", place))
    return out


def person_to_gedcom(person: dict, indi_ref: str,
                     famc: list[str], fams: list[str]) -> list[str]:
    """
    Build the GEDCOM INDI record lines for one person.

    Args:
        person: person dict as produced by parse_block.
        indi_ref: the person's own xref (e.g. '@I0001@').
        famc: xrefs of families in which the person is a child.
        fams: xrefs of families in which the person is a spouse.

    Returns:
        The record's lines, starting with the level-0 INDI line, followed
        by NAME, SEX, birth/baptism, DEAT, OCCU, FAMC/FAMS links and
        general notes (each only when data is available).
    """
    lines = [gedcom_line(0, indi_ref, "INDI")]

    # Name
    given = person.get("given", "")
    surname = person.get("surname", "")
    full = f"{given} /{surname}/" if surname else given
    if full:
        lines.append(gedcom_line(1, "NAME", full))
        if given:
            lines.append(gedcom_line(2, "GIVN", given))
        if surname:
            lines.append(gedcom_line(2, "SURN", surname))

    # Sex
    sex = person.get("sex", "")
    if sex:
        lines.append(gedcom_line(1, "SEX", sex))

    # Birth / baptism, then death.  The birth dict may carry its own event
    # tag under "type"; a missing or empty value means BIRT.
    birth = person.get("birth") or {}
    death = person.get("death") or {}
    events = (
        (birth.get("type") or "BIRT", birth, person.get("birth_notes", [])),
        ("DEAT", death, person.get("death_notes", [])),
    )
    for event_tag, event, notes in events:
        has_data = event.get("date") or event.get("plac")
        if not (has_data or notes):
            continue
        lines.append(gedcom_line(1, event_tag))
        if has_data:
            lines.extend(_event_date_place_lines(event, level=2))
        lines.extend(_emit_grouped_notes(notes, base_level=2))

    # Occupation
    occu = person.get("occupation", "")
    if occu:
        lines.append(gedcom_line(1, "OCCU", occu))

    # Family links
    lines.extend(gedcom_line(1, "FAMC", fc) for fc in famc)
    lines.extend(gedcom_line(1, "FAMS", fs) for fs in fams)

    # General notes (INDI level)
    lines.extend(_emit_grouped_notes(person.get("general_notes", []), base_level=1))

    return lines
|
||
|
||
|
||
def build_gedcom_for_family(family_name: str,
                            pages: list[dict]) -> list[str]:
    """
    Parse all pages of one family and emit the GEDCOM body records.

    Steps:
      1. Parse every page into person dicts, merging persons that appear
         on several pages (same hierarchical ID).
      2. Assign an INDI xref to every person, in sorted ID order.
      3. Derive FAM records from each person's marriages and from the
         union letters carried by the children's IDs.
      4. Emit INDI, FAM, spouse INDI, REPO and SOUR records.

    Args:
        family_name: passed through to parse_block.
        pages: page dicts; only the "body" key is read here.

    Returns:
        GEDCOM lines without header/trailer; [] when no person was parsed.
    """
    # ── Step 1: parse all pages into a flat persons dict ──
    persons_by_id: dict[str, dict] = {}  # person_id → person data

    for page in pages:
        body = page["body"]
        if not body.strip():
            continue
        for block in split_into_person_blocks(extract_paragraphs(body)):
            p = parse_block(block, family_name)
            pid = p["id"]
            if pid not in persons_by_id:
                persons_by_id[pid] = p
                continue

            # Same ID seen again: fill gaps from the new occurrence.
            existing = persons_by_id[pid]
            for field in ("birth", "death", "occupation"):
                if not existing.get(field) and p.get(field):
                    existing[field] = p[field]
            for notes_field in ("birth_notes", "death_notes", "general_notes"):
                existing.setdefault(notes_field, []).extend(p.get(notes_field, []))
            # Marriages: append spouses not already known (by lower-cased name).
            if p.get("marriages"):
                ex_spouses = {m["spouse"].lower() for m in existing.get("marriages", [])}
                for nm in p["marriages"]:
                    spouse_lc = nm["spouse"].lower()
                    if spouse_lc not in ex_spouses:
                        existing.setdefault("marriages", []).append(nm)
                        ex_spouses.add(spouse_lc)
            if not existing.get("sex") and p.get("sex"):
                existing["sex"] = p["sex"]

    if not persons_by_id:
        return []

    # ── Step 2: assign INDI xrefs ──
    xref: dict[str, str] = {pid: new_indi() for pid in sorted(persons_by_id)}

    # ── Step 3: resolve parent→child links and union letters ──

    # "0" is the implicit root (gen-1 ancestor); IDs without a dot hang
    # under it when it exists.
    has_root = "0" in persons_by_id

    child_ul: dict[str, str] = {}        # child pid → "" | "a" | "b" | …
    child_parent: dict[str, str] = {}    # child pid → parent pid (parent present in tree)
    parent_union_letters: dict[str, list[str]] = {}  # parent pid → sorted letters seen
    for pid in persons_by_id:
        if pid == "0":
            continue
        par = parent_id(pid)
        if par is None and has_root:
            par = "0"
        if par and par in persons_by_id:
            ul = child_union_letter(pid)
            child_ul[pid] = ul
            child_parent[pid] = par
            letters = parent_union_letters.setdefault(par, [])
            if ul not in letters:
                letters.append(ul)

    for letters in parent_union_letters.values():
        letters.sort()

    # A fam_key is "<parent_pid>#<suffix>".  The suffix is a union letter
    # (possibly empty), "childlessN" for a marriage without descendants in
    # the tree, or "_ulN" when there are more with-children marriages than
    # union letters.
    fam_keys_cache: dict[str, list[str]] = {}

    def fam_keys_for_parent(par_pid: str) -> list[str]:
        """
        Return the ordered fam_keys of this parent, one per marriage in
        text order.  A parent without any recorded marriage gets one key
        per union letter found on their children, so those children still
        receive a FAM record.  Results are memoised; inputs are fixed once
        step 3 has run.
        """
        cached = fam_keys_cache.get(par_pid)
        if cached is not None:
            return cached

        marriages = persons_by_id[par_pid].get("marriages", [])
        known_letters = parent_union_letters.get(par_pid, [])

        if not marriages:
            keys = [f"{par_pid}#{ul}" for ul in known_letters]
        elif len(marriages) == 1:
            # Single marriage: first union letter of the children, or an
            # empty suffix when there are no children.
            uls = known_letters or [""]
            keys = [f"{par_pid}#{uls[0]}"]
        else:
            # Several marriages: those flagged as having children consume
            # the union letters in order; the others get childless keys.
            union_letters = sorted(known_letters)
            with_children = [m for m in marriages if m.get("has_children_text")]
            # Fallback: if text detection failed, assume the last marriage
            # is the one with children.
            if not with_children and union_letters:
                with_children = [marriages[-1]]

            keys = []
            ul_iter = iter(union_letters)
            childless_count = 0
            for m in marriages:
                if m in with_children:
                    ul = next(ul_iter, f"_ul{len(keys)}")
                    keys.append(f"{par_pid}#{ul}")
                else:
                    keys.append(f"{par_pid}#childless{childless_count}")
                    childless_count += 1

        fam_keys_cache[par_pid] = keys
        return keys

    fam_xrefs: dict[str, str] = {}   # fam_key → @Fxxxx@
    famc_fam: dict[str, str] = {}    # child pid → fam_key of the family of origin

    for pid, par in child_parent.items():
        ul = child_ul.get(pid, "")
        par_keys = fam_keys_for_parent(par)

        # Allocate xrefs for all of the parent's families on first contact.
        for fk in par_keys:
            if fk not in fam_xrefs:
                fam_xrefs[fk] = new_fam()

        if not par_keys:
            continue
        # Pick the family whose suffix equals the child's union letter,
        # falling back to the parent's first family.
        famc_fam[pid] = next(
            (fk for fk in par_keys if fk.partition("#")[2] == ul),
            par_keys[0],
        )

    # Persons with marriages but no children in the tree also need FAM records.
    for par_pid, person in persons_by_id.items():
        if not person.get("marriages"):
            continue
        for fk in fam_keys_for_parent(par_pid):
            if fk not in fam_xrefs:
                fam_xrefs[fk] = new_fam()

    # Reverse map: fam_key → child pids
    fam_children: dict[str, list[str]] = {}
    for child_pid, fk in famc_fam.items():
        fam_children.setdefault(fk, []).append(child_pid)

    # child pid → @Fxxxx@
    person_famc: dict[str, str] = {
        pid: fam_xrefs[fk] for pid, fk in famc_fam.items() if fk in fam_xrefs
    }
    # parent pid → list of @Fxxxx@ (one per family)
    person_fams: dict[str, list[str]] = {}
    for fk, fr in fam_xrefs.items():
        person_fams.setdefault(fk.split("#")[0], []).append(fr)

    # ── Step 4: emit GEDCOM ──
    lines: list[str] = []

    # Spouses are not part of the ID tree; they are de-duplicated by
    # lower-cased name within this family file.
    # name_lc → {xref, name, sex, birth, death, occu, fams}
    spouse_data: dict[str, dict] = {}

    def spouse_record(name: str, sex: str) -> dict:
        key = name.strip().lower()
        if key not in spouse_data:
            spouse_data[key] = {"xref": new_indi(), "name": name, "sex": sex,
                                "birth": "", "death": "", "occu": "", "fams": []}
        return spouse_data[key]

    # INDI records for known persons
    for pid, person in sorted(persons_by_id.items()):
        famc_list = [person_famc[pid]] if pid in person_famc else []
        fams_list = person_fams.get(pid, [])
        lines += person_to_gedcom(person, xref[pid], famc_list, fams_list)

    # FAM records — one per fam_key
    for fam_key, fam_ref in fam_xrefs.items():
        par_pid = fam_key.split("#")[0]
        parent = persons_by_id.get(par_pid, {})
        par_xref = xref.get(par_pid, "")

        # Which of the parent's marriages does this FAM correspond to?
        par_keys = fam_keys_for_parent(par_pid)
        try:
            marr_idx = par_keys.index(fam_key)
        except ValueError:
            marr_idx = 0
        marriages = parent.get("marriages", [])
        marr = marriages[marr_idx] if marr_idx < len(marriages) else {}

        lines.append(gedcom_line(0, fam_ref, "FAM"))

        # Roles are decided once so parent and spouse never share a tag:
        # a parent of unknown sex is written as HUSB and the spouse as WIFE.
        if parent.get("sex", "") == "F":
            parent_tag, spouse_tag, spouse_sex = "WIFE", "HUSB", "M"
        else:
            parent_tag, spouse_tag, spouse_sex = "HUSB", "WIFE", "F"
        lines.append(gedcom_line(1, parent_tag, par_xref))

        # Spouse
        spouse_name = marr.get("spouse", "")
        if spouse_name:
            sd = spouse_record(spouse_name, spouse_sex)
            for field, marr_key in (("birth", "spouse_birth"),
                                    ("death", "spouse_death"),
                                    ("occu", "spouse_occu")):
                if not sd[field] and marr.get(marr_key):
                    sd[field] = marr[marr_key]
            sd["fams"].append(fam_ref)
            lines.append(gedcom_line(1, spouse_tag, sd["xref"]))

        # Marriage event
        marr_notes = marr.get("notes", [])
        if marr.get("date") or marr.get("plac") or marr_notes:
            lines.append(gedcom_line(1, "MARR"))
            _t, _raw = _split_place(marr.get("plac", ""))
            if marr.get("date"):
                lines.append(gedcom_line(2, "DATE", marr["date"]))
                if _t:
                    lines.append(gedcom_line(3, "TIME", _t))
            elif _t:
                lines.append(gedcom_line(2, "TIME", _t))
            _p = _expand_place(_raw)
            if _p:
                lines.append(gedcom_line(2, "PLAC", _p))
            src = marr.get("source", {})
            if src.get("title"):
                sour_xref = _get_or_create_sour(
                    src["title"], src.get("depot", ""), src.get("caln", ""))
                lines.append(gedcom_line(2, "SOUR", sour_xref))
                if src.get("page"):
                    lines.append(gedcom_line(3, "PAGE", src["page"]))
            lines.extend(_emit_grouped_notes(marr_notes, base_level=2))

        # Children belonging to this FAM
        for child_pid in sorted(fam_children.get(fam_key, [])):
            child_xref = xref.get(child_pid, "")
            if child_xref:
                lines.append(gedcom_line(1, "CHIL", child_xref))

    # Spouse INDI records — emitted AFTER the FAM loop so spouse_data is complete
    for sd in spouse_data.values():
        s_name = sd["name"]
        given, surname = _split_name(s_name)
        # No given name found (e.g. the whole name is upper-case): retry on
        # the title-cased name.
        if not given:
            given, surname = _split_name(s_name.title())
        full = f"{given} /{surname}/" if surname else (given or s_name)
        lines.append(gedcom_line(0, sd["xref"], "INDI"))
        lines.append(gedcom_line(1, "NAME", full))
        if given:
            lines.append(gedcom_line(2, "GIVN", given))
        if surname:
            lines.append(gedcom_line(2, "SURN", surname))
        if sd["sex"]:
            lines.append(gedcom_line(1, "SEX", sd["sex"]))
        if sd["birth"]:
            lines.append(gedcom_line(1, "BIRT"))
            lines.append(gedcom_line(2, "DATE", sd["birth"]))
        if sd["death"]:
            lines.append(gedcom_line(1, "DEAT"))
            lines.append(gedcom_line(2, "DATE", sd["death"]))
        if sd["occu"]:
            lines.append(gedcom_line(1, "OCCU", sd["occu"]))
        for fref in sd["fams"]:
            lines.append(gedcom_line(1, "FAMS", fref))

    # REPO records
    for rx, rname in _repo_records.items():
        lines.append(gedcom_line(0, rx, "REPO"))
        lines.append(gedcom_line(1, "NAME", rname))

    # SOUR records
    for sx, srec in _sour_records.items():
        lines.append(gedcom_line(0, sx, "SOUR"))
        lines.append(gedcom_line(1, "TITL", srec["title"]))
        if srec["repo_xref"]:
            lines.append(gedcom_line(1, "REPO", srec["repo_xref"]))
            if srec["caln"]:
                lines.append(gedcom_line(2, "CALN", srec["caln"]))

    return lines
|
||
|
||
|
||
def build_gedcom_file(family_name: str, pages: list[dict]) -> str:
    """
    Assemble the full GEDCOM text (HEAD, records, TRLR) for one family.

    All xref counters and the source/repository registries are reset first,
    so numbering restarts at 1 in every file.  The result ends with a
    newline.
    """
    global _indi_counter, _fam_counter, _sour_counter, _repo_counter
    global _sour_registry, _repo_registry, _sour_records, _repo_records

    # Fresh numbering and fresh registries for this file.
    _indi_counter = _fam_counter = _sour_counter = _repo_counter = 0
    _sour_registry, _repo_registry = {}, {}
    _sour_records, _repo_records = {}, {}

    records = build_gedcom_for_family(family_name, pages)

    preamble = [
        "0 HEAD",
        "1 SOUR BaseCGL",
        f"2 NAME {SOURCE_STR}",
        "1 GEDC",
        "2 VERS 5.5.1",
        "2 FORM LINEAGE-LINKED",
        "1 CHAR UTF-8",
        f"1 NOTE Filiations {family_name} – export automatique depuis les pages Drupal",
    ]
    document = [*preamble, *records, "0 TRLR"]

    return "\n".join(document) + "\n"
|
||
|
||
|
||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
||
def main():
    """
    Run the export: read SQL_FILE, group pages by family and write one
    ``filiations_<FAMILY>.ged`` file per family into OUT_DIR, printing a
    short progress report.
    """
    # parents=True: a missing intermediate directory must not abort the run.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Streaming {SQL_FILE} …", flush=True)
    nodes = stream_filiation_nodes(SQL_FILE)
    print(f" Found {len(nodes)} filiation book pages")

    families = group_by_family(nodes)
    print(f" Found {len(families)} families: {', '.join(sorted(families))}")

    total_persons = 0
    for fname in sorted(families):
        pages = families[fname]
        gen_pages = [p for p in pages if p["gen"] > 0]
        root_pages = [p for p in pages if p["gen"] == 0]
        # Root (generation-less) pages are processed before numbered generations.
        all_pages = root_pages + gen_pages

        print(f"\n{fname}: {len(all_pages)} pages ({len(gen_pages)} generations)")

        gedcom = build_gedcom_file(fname, all_pages)

        # Spaces → "_", apostrophes dropped; "/" would otherwise be read as
        # a directory separator.
        safe_name = fname.replace(' ', '_').replace(chr(39), '').replace('/', '_')
        out_path = OUT_DIR / f"filiations_{safe_name}.ged"
        out_path.write_text(gedcom, encoding="utf-8")

        # Count records by the level-0 xref prefix at the START of a line,
        # so matching text inside a value is not counted.
        record_lines = gedcom.split("\n")
        n_indi = sum(1 for ln in record_lines if ln.startswith("0 @I"))
        n_fam = sum(1 for ln in record_lines if ln.startswith("0 @F"))
        total_persons += n_indi
        # Report the encoded size: len(gedcom) would count characters.
        n_bytes = len(gedcom.encode("utf-8"))
        print(f" → {out_path.name} ({n_indi} INDI, {n_fam} FAM, {n_bytes:,} bytes)")

    print(f"\nDone. Total individuals across all families: {total_persons}")
    print(f"Output directory: {OUT_DIR}")
|
||
|
||
|
||
# Script entry point: run the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|