Add CSV export of mariage table and merged GEDCOM

- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql
  en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM,
  TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées
  PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les
  noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale
  qui produisait des artefacts type "FÃ©lix")
- csv_export/ : 4 fichiers générés (380 892 enregistrements au total)
- merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les
  identifiants INDI/FAM pour éviter les collisions
- lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 15:22:28 +02:00
parent f75cbebb44
commit e467b9662a
7 changed files with 435895 additions and 0 deletions
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+287
View File
@@ -0,0 +1,287 @@
#!/usr/bin/env python3
"""
Export de la table `mariage` depuis basesgen.sql vers 4 fichiers CSV
classés par type d'acte :
- mariages_religieux.csv (BMS)
- contrats_de_mariages.csv (CM + TABLE CM)
- tables_de_mariages.csv (TABLE MARIAGES)
- etat_civil.csv (ETAT CIVIL)
"""
import csv
import os
import sys
import time
# Hard-coded input dump and output directory.
SQL_FILE = "/home/yannick/Téléchargements/basesgen.sql"
OUTPUT_DIR = "/home/yannick/BasesCGL/csv_export"

# Columns of the `mariage` table, in the order they appear in a SQL tuple
# (26 columns).
SQL_COLUMNS = [
    # Record key and marriage date
    "CLE_MARIAGE",
    "JOUR_MARIAGE",
    "MOIS_MARIAGE",
    "ANNEE_MARIAGE",
    # Groom
    "NOM_EPOUX",
    "PRENOM_EPOUX",
    "AGE_EPOUX",
    "PROF_EPOUX",
    "DATE_NAISSANCE_EPOUX",
    "VILLE_EPOUX",
    "NOM_PERE_EPOUX",
    "NOM_MERE_EPOUX",
    "VEUF_EPOUX",
    # Bride
    "NOM_EPOUSE",
    "PRENOM_EPOUSE",
    "AGE_EPOUSE",
    "PROF_EPOUSE",
    "DATE_NAISSANCE_EPOUSE",
    "VILLE_EPOUSE",
    "NOM_PERE_EPOUSE",
    "NOM_MERE_EPOUSE",
    "VEUVE_EPOUSE",
    # Act metadata
    "OBSERVATION_MARIAGE",
    "LIEU_ACTE",
    "TYPE_ACTE_BRUT",
    "CODE_INSEE",
]

# 0-based position of the act type in a parsed SQL row.
TYPE_ACTE_IDX = SQL_COLUMNS.index("TYPE_ACTE_BRUT")

# 0-based positions of the parent-name columns in a parsed SQL row; these are
# the values inspected for a trailing '+'.
IDX_PERE_EPOUX = SQL_COLUMNS.index("NOM_PERE_EPOUX")
IDX_MERE_EPOUX = SQL_COLUMNS.index("NOM_MERE_EPOUX")
IDX_PERE_EPOUSE = SQL_COLUMNS.index("NOM_PERE_EPOUSE")
IDX_MERE_EPOUSE = SQL_COLUMNS.index("NOM_MERE_EPOUSE")

# CSV header (30 columns): the SQL columns with one derived "deceased" flag
# column inserted right after each of the four parent-name columns.
CSV_COLUMNS = (
    SQL_COLUMNS[:IDX_PERE_EPOUX + 1]
    + ["PERE_EPOUX_DECEDE"]
    + [SQL_COLUMNS[IDX_MERE_EPOUX], "MERE_EPOUX_DECEDEE"]
    + SQL_COLUMNS[IDX_MERE_EPOUX + 1:IDX_PERE_EPOUSE + 1]
    + ["PERE_EPOUSE_DECEDE"]
    + [SQL_COLUMNS[IDX_MERE_EPOUSE], "MERE_EPOUSE_DECEDEE"]
    + SQL_COLUMNS[IDX_MERE_EPOUSE + 1:]
)

# Category key -> output CSV file name.
OUTPUT_FILES = dict(
    BMS="mariages_religieux.csv",
    CM="contrats_de_mariages.csv",
    TABLES="tables_de_mariages.csv",
    ETAT_CIVIL="etat_civil.csv",
)
def classify(type_acte: str) -> str | None:
# Normaliser les espaces multiples pour la classification
import re as _re
ta = _re.sub(r'\s+', ' ', type_acte.strip()).upper()
if ta.startswith("BMS"):
return "BMS"
if ta.startswith("ETAT CIVIL") or ta.startswith("ETAT CILVIL"):
return "ETAT_CIVIL"
if ta.startswith("TABLE CM") or ta.startswith("TABLE C"):
return "CM" # fusionné avec Contrats de Mariages
if ta.startswith("TABLE"):
return "TABLES" # TABLE MARIAGES, TABLE PUBLICATIONS DE MARIAGES, etc.
if ta.startswith("CM"):
return "CM"
return None
def parse_mysql_value(src: str, pos: int) -> tuple[str, int]:
    """Parse a single MySQL value starting at ``pos``.

    Leading spaces and tabs are skipped. Two forms are recognised:

    * a single-quoted string, with backslash escapes and the SQL doubled
      quote (``''``) decoded;
    * an unquoted token (NULL, integer, decimal, ...), which extends up to
      the next comma or the end of ``src``. ``NULL`` is returned as ``""``.

    Returns ``(value, next_pos)`` where ``next_pos`` is the index just after
    the value. ``next_pos`` is always greater than ``pos`` unless ``pos`` is
    at the end of ``src`` or points directly at a comma (empty value).
    """
    end = len(src)
    while pos < end and src[pos] in " \t":
        pos += 1
    if pos >= end:
        return ("", pos)

    if src[pos] == "'":
        # Escapes whose decoded character differs from the escaped letter;
        # any other escaped character stands for itself (\' \" \\ ...).
        escapes = {"n": "\n", "r": "\r", "t": "\t",
                   "0": "\0", "b": "\b", "Z": "\x1a"}
        pos += 1
        buf = []
        while pos < end:
            ch = src[pos]
            if ch == "\\" and pos + 1 < end:
                nxt = src[pos + 1]
                buf.append(escapes.get(nxt, nxt))
                pos += 2
            elif ch == "'":
                if pos + 1 < end and src[pos + 1] == "'":
                    # '' inside a quoted string is an escaped single quote.
                    buf.append("'")
                    pos += 2
                else:
                    pos += 1  # closing quote
                    break
            else:
                buf.append(ch)
                pos += 1
        return ("".join(buf), pos)

    # Unquoted token. Consuming up to the next comma guarantees progress, so a
    # caller looping over values cannot spin forever on unexpected input.
    stop = src.find(",", pos)
    if stop == -1:
        stop = end
    token = src[pos:stop].strip()
    if token.upper() == "NULL":
        token = ""
    return (token, stop)
def parse_row(line: str) -> list[str] | None:
    """Parse one INSERT tuple line into a list of string values.

    The line must be a single parenthesised tuple, optionally followed by
    ``,`` or ``;``. Returns None when the line is not such a tuple, when a
    value cannot be parsed, or when the number of values differs from
    ``len(SQL_COLUMNS)``.
    """
    line = line.strip()
    if not line.startswith("("):
        return None

    # Drop the opening parenthesis and the closing ")", ")," or ");".
    if line.endswith((");", "),")):
        inner = line[1:-2]
    elif line.endswith(")"):
        inner = line[1:-1]
    else:
        return None

    values = []
    pos = 0
    end = len(inner)
    while pos < end:
        # Skip separators (spaces, tabs, commas) between values.
        if inner[pos] in " \t,":
            pos += 1
            continue
        val, next_pos = parse_mysql_value(inner, pos)
        if next_pos <= pos:
            # The value parser could not consume anything: treat the row as
            # malformed instead of looping forever on the same character.
            return None
        pos = next_pos
        values.append(val)

    return values if len(values) == len(SQL_COLUMNS) else None
def deceased_flag(name: str) -> str:
    """Return "1" if ``name``, ignoring surrounding whitespace, ends with '+', else ""."""
    trimmed = name.strip()
    if trimmed[-1:] == "+":
        return "1"
    return ""
def enrich_row(sql_row: list[str]) -> list[str]:
    """Build the 30-value CSV row from a 26-value SQL row.

    The SQL values are kept in order; a derived deceased flag (see
    ``deceased_flag``) is inserted right after each of the four parent-name
    values.
    """
    row = sql_row
    pere_epoux = row[IDX_PERE_EPOUX]
    mere_epoux = row[IDX_MERE_EPOUX]
    pere_epouse = row[IDX_PERE_EPOUSE]
    mere_epouse = row[IDX_MERE_EPOUSE]

    # Key, date and groom fields (SQL positions 0-9), unchanged.
    enriched = [row[i] for i in range(10)]
    # Groom's parents with their flags, then VEUF_EPOUX (SQL position 12).
    enriched += [
        pere_epoux, deceased_flag(pere_epoux),
        mere_epoux, deceased_flag(mere_epoux),
        row[12],
    ]
    # Bride fields (SQL positions 13-18), unchanged.
    enriched += [row[i] for i in range(13, 19)]
    # Bride's parents with their flags, then VEUVE_EPOUSE (SQL position 21).
    enriched += [
        pere_epouse, deceased_flag(pere_epouse),
        mere_epouse, deceased_flag(mere_epouse),
        row[21],
    ]
    # Remaining fields (SQL positions 22-25), unchanged.
    enriched += [row[i] for i in range(22, 26)]
    return enriched
def main():
    """Stream the SQL dump and split the `mariage` rows into four CSV files.

    Reads ``SQL_FILE`` line by line, keeps the tuple lines that follow an
    ``INSERT INTO `mariage``` header line, classifies each row by its act type
    and writes it (with the derived columns) to the matching CSV file in
    ``OUTPUT_DIR``. Progress and a final summary are printed to stdout.

    Rows are expected one per line after the INSERT header line; tuples that
    sit on the header line itself are not read.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    file_size = os.path.getsize(SQL_FILE)

    max_anomalies_shown = 10
    counts = {k: 0 for k in OUTPUT_FILES}
    skipped = 0
    anomalies = []  # only the first few are kept; `skipped` holds the total
    handles = {}
    writers = {}

    def note_skip(message):
        """Count a skipped record and remember its message if still useful."""
        nonlocal skipped
        skipped += 1
        if len(anomalies) < max_anomalies_shown:
            anomalies.append(message)

    start = time.time()
    try:
        # Open the four output CSV files (UTF-8 with BOM for Excel
        # compatibility). Opening inside the try block ensures every handle
        # already opened is closed if anything below raises.
        for key, fname in OUTPUT_FILES.items():
            path = os.path.join(OUTPUT_DIR, fname)
            fh = open(path, 'w', encoding='utf-8-sig', newline='')
            handles[key] = fh
            writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
            writer.writerow(CSV_COLUMNS)
            writers[key] = writer

        bytes_read = 0
        in_mariage_insert = False
        last_pct = -1

        print(f"Lecture de {SQL_FILE} ({file_size / 1e9:.2f} Go)…")

        with open(SQL_FILE, encoding='utf-8', errors='replace') as f:
            for line in f:
                # Approximate byte position: the decoded line is re-encoded,
                # which can differ slightly from the bytes actually read.
                bytes_read += len(line.encode('utf-8', errors='replace'))

                # Progress report every 5 %.
                pct = int(bytes_read * 100 / file_size)
                if pct != last_pct and pct % 5 == 0:
                    elapsed = time.time() - start
                    total_est = int(elapsed / max(bytes_read, 1) * file_size)
                    remaining = max(0, total_est - int(elapsed))
                    print(f" {pct:3d}% ({bytes_read / 1e9:.2f} Go) "
                          f"~{remaining // 60}m{remaining % 60:02d}s restant", flush=True)
                    last_pct = pct

                stripped = line.strip()

                # Start of an INSERT block for the `mariage` table. A header
                # that is itself terminated by ';' opens no block.
                if 'INSERT INTO `mariage`' in line:
                    in_mariage_insert = not stripped.endswith(';')
                    continue

                if not in_mariage_insert:
                    continue

                # Blank line or SQL comment: the INSERT block is over.
                if stripped == '' or stripped.startswith(('--', '/*!')):
                    in_mariage_insert = False
                    continue

                row = parse_row(line)
                if row is None:
                    if stripped.startswith('('):
                        note_skip(stripped[:120])
                else:
                    type_acte = row[TYPE_ACTE_IDX]
                    cat = classify(type_acte)
                    if cat is None:
                        note_skip(f"TYPE_ACTE inconnu: {repr(type_acte)}")
                    else:
                        writers[cat].writerow(enrich_row(row))
                        counts[cat] += 1

                # A line ending with ';' closes the statement. This is checked
                # whether or not the row was exported, so a rejected last row
                # cannot leave the block open for the following tables.
                if stripped.endswith(';'):
                    in_mariage_insert = False
    finally:
        for fh in handles.values():
            fh.close()

    elapsed = int(time.time() - start)
    total = sum(counts.values())

    print(f"\n{'='*55}")
    print(f"Export terminé en {elapsed // 60}m{elapsed % 60:02d}s")
    print(f"{'='*55}")
    for key, fname in OUTPUT_FILES.items():
        path = os.path.join(OUTPUT_DIR, fname)
        size_kb = os.path.getsize(path) // 1024
        print(f" {fname:<35s} {counts[key]:>7,} lignes ({size_kb:>6,} Ko)")
    print(f" {'TOTAL':<35s} {total:>7,} lignes")

    if skipped:
        print(f"\n{skipped} enregistrement(s) ignoré(s) :")
        for a in anomalies:
            print(f" {a}")
        if skipped > max_anomalies_shown:
            print(f" … et {skipped - max_anomalies_shown} autres")


if __name__ == "__main__":
    main()
+54599
View File
File diff suppressed because it is too large Load Diff
+113
View File
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""
import glob
import os
import re
import sys
# Directory searched by main() for the individual "*.ged" files to merge.
INPUT_DIR = "gedcom_output"
# Path of the merged GEDCOM file written by main().
OUTPUT_FILE = "lignees.ged"
def parse_gedcom(path):
    """Split a GEDCOM file into its level-0 records.

    Returns ``(header_note, records)``:

    * ``header_note`` is the text of the last ``1 NOTE`` line found in the
      ``0 HEAD`` record, or ``""`` if there is none;
    * ``records`` is the list of all other level-0 records except
      ``0 TRLR``, each as one string with its lines joined by ``"\\n"``.

    Blank lines are ignored.
    """
    # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the BOM
    # stays glued to "0 HEAD", which is then no longer recognised as the header.
    with open(path, encoding="utf-8-sig") as f:
        lines = f.read().splitlines()

    records = []
    header_note = ""
    current = []

    def flush():
        """File the record accumulated in ``current`` (header, trailer or data)."""
        nonlocal header_note
        if not current:
            return
        tag = current[0].rstrip()
        if tag == "0 HEAD":
            # Keep the NOTE line for source tracking.
            for entry in current:
                if entry.startswith("1 NOTE "):
                    header_note = entry[7:]
        elif tag != "0 TRLR":
            records.append("\n".join(current))

    for line in lines:
        if not line.strip():
            continue  # blank lines carry no data and must not join a record
        if line.startswith("0 "):
            flush()
            current = [line]
        else:
            current.append(line)
    flush()

    return header_note, records
def renumber_records(records, indi_offset, fam_offset):
    """Shift every ``@I<n>@`` and ``@F<n>@`` id found in ``records``.

    ``indi_offset`` is added to the number of each ``@I...@`` id and
    ``fam_offset`` to each ``@F...@`` id; the result is written zero-padded
    to at least four digits. Returns a new list, in the same order.
    """
    offsets = {"I": indi_offset, "F": fam_offset}
    xref = re.compile(r"@([IF])(\d+)@")

    def shifted(match):
        kind, digits = match.groups()
        return f"@{kind}{int(digits) + offsets[kind]:04d}@"

    return [xref.sub(shifted, record) for record in records]
def _xref_numbers(records, kind):
    """Return the numbers of all ``@<kind><n>@`` ids used in ``records``."""
    pattern = re.compile(rf"@{kind}(\d+)@")
    return [int(n) for record in records for n in pattern.findall(record)]


def _collision_free_offset(numbers, next_free):
    """Return the smallest offset >= 0 that moves all ``numbers`` to ``next_free`` or above."""
    if not numbers:
        return 0
    return max(0, next_free - min(numbers))


def main():
    """Merge every ``*.ged`` file of ``INPUT_DIR`` into ``OUTPUT_FILE``.

    Files are processed in sorted order. The INDI and FAM ids of each file
    are shifted so that they start after the highest id already used by the
    previous files; the records are then written under a fresh header that
    lists one source line per input file. Exits with status 1 when no input
    file is found.
    """
    ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
    if not ged_files:
        print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(ged_files)} GEDCOM files to merge.")

    all_records = []
    sources = []
    # First id number not yet used in the merged output, per id kind.
    next_free = {"I": 0, "F": 0}

    for path in ged_files:
        basename = os.path.basename(path)
        note, records = parse_gedcom(path)
        sources.append(note or basename)

        indi_count = sum(1 for r in records if r.startswith("0 @I"))
        fam_count = sum(1 for r in records if r.startswith("0 @F"))

        # Offsets are derived from the ids actually present rather than from
        # record counts: counts only avoid collisions when every file numbers
        # its ids contiguously from the same base. For contiguous 1-based
        # files both methods give the same offsets.
        offsets = {}
        for kind in ("I", "F"):
            numbers = _xref_numbers(records, kind)
            offsets[kind] = _collision_free_offset(numbers, next_free[kind])
            if numbers:
                next_free[kind] = max(numbers) + offsets[kind] + 1
        indi_offset = offsets["I"]
        fam_offset = offsets["F"]

        all_records.extend(renumber_records(records, indi_offset, fam_offset))

        print(f" {basename}: {indi_count} INDI, {fam_count} FAM (offset I+{indi_offset}, F+{fam_offset})")

    total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
    total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
    print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")

    # NOTE(review): blank separator lines are written as before; confirm that
    # the GEDCOM readers this file is meant for accept them.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write("0 HEAD\n")
        out.write("1 SOUR BaseCGL\n")
        out.write("2 NAME CGL Bases généalogiques du Languedoc basesgen.sql\n")
        out.write("1 GEDC\n")
        out.write("2 VERS 5.5.1\n")
        out.write("2 FORM LINEAGE-LINKED\n")
        out.write("1 CHAR UTF-8\n")
        # Report the real number of merged files instead of a fixed figure.
        out.write(f"1 NOTE Lignées CGL fusion de {len(ged_files)} exports GEDCOM\n")
        for src in sources:
            out.write(f"2 CONT {src}\n")
        out.write("\n")

        for record in all_records:
            out.write(record)
            out.write("\n\n")

        out.write("0 TRLR\n")

    print("Done.")


if __name__ == "__main__":
    main()