Add CSV export of mariage table and merged GEDCOM
- export_mariages_csv.py : extrait la table `mariage` depuis basesgen.sql en 4 fichiers CSV classés par type d'acte (BMS, CM+TABLE CM, TABLE MARIAGES, ETAT CIVIL) ; ajoute 4 colonnes dérivées PERE/MERE_EPOUX/EPOUSE_DECEDE(E) détectées via le suffixe '+' sur les noms de parents ; encodage UTF-8 (corrige la lecture latin-1 initiale qui produisait des artefacts type "Félix") - csv_export/ : 4 fichiers générés (380 892 enregistrements au total) - merge_gedcom.py : fusionne les GEDCOM individuels en renumérotant les identifiants INDI/FAM pour éviter les collisions - lignees.ged : fusion des 16 exports (4 299 individus, 1 484 familles) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,287 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Export de la table `mariage` depuis basesgen.sql vers 4 fichiers CSV
|
||||
classés par type d'acte :
|
||||
- mariages_religieux.csv (BMS)
|
||||
- contrats_de_mariages.csv (CM + TABLE CM)
|
||||
- tables_de_mariages.csv (TABLE MARIAGES)
|
||||
- etat_civil.csv (ETAT CIVIL)
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
SQL_FILE = "/home/yannick/Téléchargements/basesgen.sql"
|
||||
OUTPUT_DIR = "/home/yannick/BasesCGL/csv_export"
|
||||
|
||||
# Colonnes telles qu'elles existent dans le SQL (26 colonnes)
|
||||
SQL_COLUMNS = [
|
||||
"CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE",
|
||||
"NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX",
|
||||
"DATE_NAISSANCE_EPOUX", "VILLE_EPOUX",
|
||||
"NOM_PERE_EPOUX", "NOM_MERE_EPOUX", "VEUF_EPOUX",
|
||||
"NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE",
|
||||
"DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE",
|
||||
"NOM_PERE_EPOUSE", "NOM_MERE_EPOUSE", "VEUVE_EPOUSE",
|
||||
"OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE",
|
||||
]
|
||||
|
||||
# Colonnes en sortie CSV (30 colonnes) : 4 colonnes dérivées insérées après chaque paire de parents
|
||||
CSV_COLUMNS = [
|
||||
"CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE",
|
||||
"NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX",
|
||||
"DATE_NAISSANCE_EPOUX", "VILLE_EPOUX",
|
||||
"NOM_PERE_EPOUX", "PERE_EPOUX_DECEDE", "NOM_MERE_EPOUX", "MERE_EPOUX_DECEDEE", "VEUF_EPOUX",
|
||||
"NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE",
|
||||
"DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE",
|
||||
"NOM_PERE_EPOUSE", "PERE_EPOUSE_DECEDE", "NOM_MERE_EPOUSE", "MERE_EPOUSE_DECEDEE", "VEUVE_EPOUSE",
|
||||
"OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE",
|
||||
]
|
||||
|
||||
TYPE_ACTE_IDX = 24 # index de TYPE_ACTE dans la ligne SQL parsée (0-based)
|
||||
|
||||
# Indices SQL des noms de parents (pour détecter le '+')
|
||||
IDX_PERE_EPOUX = 10
|
||||
IDX_MERE_EPOUX = 11
|
||||
IDX_PERE_EPOUSE = 19
|
||||
IDX_MERE_EPOUSE = 20
|
||||
|
||||
# Mapping catégorie → nom de fichier
|
||||
OUTPUT_FILES = {
|
||||
"BMS": "mariages_religieux.csv",
|
||||
"CM": "contrats_de_mariages.csv",
|
||||
"TABLES": "tables_de_mariages.csv",
|
||||
"ETAT_CIVIL": "etat_civil.csv",
|
||||
}
|
||||
|
||||
|
||||
def classify(type_acte: str) -> str | None:
|
||||
# Normaliser les espaces multiples pour la classification
|
||||
import re as _re
|
||||
ta = _re.sub(r'\s+', ' ', type_acte.strip()).upper()
|
||||
if ta.startswith("BMS"):
|
||||
return "BMS"
|
||||
if ta.startswith("ETAT CIVIL") or ta.startswith("ETAT CILVIL"):
|
||||
return "ETAT_CIVIL"
|
||||
if ta.startswith("TABLE CM") or ta.startswith("TABLE C"):
|
||||
return "CM" # fusionné avec Contrats de Mariages
|
||||
if ta.startswith("TABLE"):
|
||||
return "TABLES" # TABLE MARIAGES, TABLE PUBLICATIONS DE MARIAGES, etc.
|
||||
if ta.startswith("CM"):
|
||||
return "CM"
|
||||
return None
|
||||
|
||||
|
||||
def parse_mysql_value(src: str, pos: int) -> tuple[str, int]:
|
||||
"""Parse a single MySQL value starting at pos; return (value_str, next_pos)."""
|
||||
# skip whitespace
|
||||
while pos < len(src) and src[pos] in (' ', '\t'):
|
||||
pos += 1
|
||||
|
||||
if pos >= len(src):
|
||||
return ("", pos)
|
||||
|
||||
if src[pos] == "'":
|
||||
# quoted string
|
||||
pos += 1
|
||||
buf = []
|
||||
while pos < len(src):
|
||||
ch = src[pos]
|
||||
if ch == '\\' and pos + 1 < len(src):
|
||||
nxt = src[pos + 1]
|
||||
if nxt == "'":
|
||||
buf.append("'")
|
||||
elif nxt == '\\':
|
||||
buf.append('\\')
|
||||
elif nxt == 'n':
|
||||
buf.append('\n')
|
||||
elif nxt == 'r':
|
||||
buf.append('\r')
|
||||
elif nxt == 't':
|
||||
buf.append('\t')
|
||||
else:
|
||||
buf.append(nxt)
|
||||
pos += 2
|
||||
elif ch == "'":
|
||||
pos += 1
|
||||
break
|
||||
else:
|
||||
buf.append(ch)
|
||||
pos += 1
|
||||
return (''.join(buf), pos)
|
||||
|
||||
if src[pos:pos + 4] == 'NULL':
|
||||
return ('', pos + 4)
|
||||
|
||||
# integer (possibly negative)
|
||||
j = pos
|
||||
if j < len(src) and src[j] == '-':
|
||||
j += 1
|
||||
while j < len(src) and src[j].isdigit():
|
||||
j += 1
|
||||
return (src[pos:j], j)
|
||||
|
||||
|
||||
def parse_row(line: str) -> list[str] | None:
|
||||
"""Parse one INSERT tuple line into a list of string values, or None if not a data row."""
|
||||
line = line.strip()
|
||||
if not line.startswith('('):
|
||||
return None
|
||||
# strip trailing ), or );
|
||||
if line.endswith(');'):
|
||||
inner = line[1:-2]
|
||||
elif line.endswith('),'):
|
||||
inner = line[1:-2]
|
||||
elif line.endswith(')'):
|
||||
inner = line[1:-1]
|
||||
else:
|
||||
return None
|
||||
|
||||
values = []
|
||||
pos = 0
|
||||
while pos < len(inner):
|
||||
# skip whitespace and commas between values
|
||||
while pos < len(inner) and inner[pos] in (' ', '\t'):
|
||||
pos += 1
|
||||
if pos >= len(inner):
|
||||
break
|
||||
if inner[pos] == ',':
|
||||
pos += 1
|
||||
continue
|
||||
val, pos = parse_mysql_value(inner, pos)
|
||||
values.append(val)
|
||||
|
||||
return values if len(values) == len(SQL_COLUMNS) else None
|
||||
|
||||
|
||||
def deceased_flag(name: str) -> str:
|
||||
"""Renvoie '1' si le nom se termine par '+', '' sinon."""
|
||||
return "1" if name.strip().endswith("+") else ""
|
||||
|
||||
|
||||
def enrich_row(sql_row: list[str]) -> list[str]:
|
||||
"""Insère les 4 colonnes dérivées dans la liste SQL (26 → 30 colonnes)."""
|
||||
r = sql_row
|
||||
return [
|
||||
# Époux (indices SQL 0-9 inchangés)
|
||||
r[0], r[1], r[2], r[3],
|
||||
r[4], r[5], r[6], r[7], r[8], r[9],
|
||||
# Parents époux + flags décès
|
||||
r[IDX_PERE_EPOUX], deceased_flag(r[IDX_PERE_EPOUX]),
|
||||
r[IDX_MERE_EPOUX], deceased_flag(r[IDX_MERE_EPOUX]),
|
||||
r[12], # VEUF_EPOUX
|
||||
# Épouse (SQL 13-18)
|
||||
r[13], r[14], r[15], r[16], r[17], r[18],
|
||||
# Parents épouse + flags décès
|
||||
r[IDX_PERE_EPOUSE], deceased_flag(r[IDX_PERE_EPOUSE]),
|
||||
r[IDX_MERE_EPOUSE], deceased_flag(r[IDX_MERE_EPOUSE]),
|
||||
r[21], # VEUVE_EPOUSE
|
||||
# Reste (SQL 22-25)
|
||||
r[22], r[23], r[24], r[25],
|
||||
]
|
||||
|
||||
|
||||
def main():
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
file_size = os.path.getsize(SQL_FILE)
|
||||
|
||||
# Ouvrir les 4 CSV en sortie (UTF-8 avec BOM pour compatibilité Excel)
|
||||
handles = {}
|
||||
writers = {}
|
||||
for key, fname in OUTPUT_FILES.items():
|
||||
path = os.path.join(OUTPUT_DIR, fname)
|
||||
fh = open(path, 'w', encoding='utf-8-sig', newline='')
|
||||
writer = csv.writer(fh, quoting=csv.QUOTE_ALL)
|
||||
writer.writerow(CSV_COLUMNS)
|
||||
handles[key] = fh
|
||||
writers[key] = writer
|
||||
|
||||
counts = {k: 0 for k in OUTPUT_FILES}
|
||||
skipped = 0
|
||||
anomalies = []
|
||||
|
||||
start = time.time()
|
||||
bytes_read = 0
|
||||
in_mariage_insert = False
|
||||
last_pct = -1
|
||||
|
||||
print(f"Lecture de {SQL_FILE} ({file_size / 1e9:.2f} Go)…")
|
||||
|
||||
with open(SQL_FILE, encoding='utf-8', errors='replace') as f:
|
||||
for line in f:
|
||||
bytes_read += len(line.encode('utf-8', errors='replace'))
|
||||
|
||||
# Affichage de la progression tous les 5 %
|
||||
pct = int(bytes_read * 100 / file_size)
|
||||
if pct != last_pct and pct % 5 == 0:
|
||||
elapsed = time.time() - start
|
||||
total_est = int(elapsed / max(bytes_read, 1) * file_size)
|
||||
remaining = max(0, total_est - int(elapsed))
|
||||
print(f" {pct:3d}% ({bytes_read / 1e9:.2f} Go) "
|
||||
f"~{remaining // 60}m{remaining % 60:02d}s restant", flush=True)
|
||||
last_pct = pct
|
||||
|
||||
# Détection des blocs INSERT de la table mariage
|
||||
if 'INSERT INTO `mariage`' in line:
|
||||
in_mariage_insert = True
|
||||
continue
|
||||
|
||||
if not in_mariage_insert:
|
||||
continue
|
||||
|
||||
# Fin du bloc INSERT
|
||||
if line.strip() == '' or line.strip().startswith('--') or line.strip().startswith('/*!'):
|
||||
if not line.strip().startswith('('):
|
||||
in_mariage_insert = False
|
||||
continue
|
||||
|
||||
row = parse_row(line)
|
||||
|
||||
if row is None:
|
||||
if line.strip().startswith('('):
|
||||
skipped += 1
|
||||
anomalies.append(line.strip()[:120])
|
||||
continue
|
||||
|
||||
type_acte = row[TYPE_ACTE_IDX]
|
||||
cat = classify(type_acte)
|
||||
|
||||
if cat is None:
|
||||
skipped += 1
|
||||
anomalies.append(f"TYPE_ACTE inconnu: {repr(type_acte)}")
|
||||
continue
|
||||
|
||||
writers[cat].writerow(enrich_row(row))
|
||||
counts[cat] += 1
|
||||
|
||||
# Fin de bloc (dernière ligne se termine par ';')
|
||||
if line.rstrip().endswith(';'):
|
||||
in_mariage_insert = False
|
||||
|
||||
for fh in handles.values():
|
||||
fh.close()
|
||||
|
||||
elapsed = int(time.time() - start)
|
||||
total = sum(counts.values())
|
||||
|
||||
print(f"\n{'='*55}")
|
||||
print(f"Export terminé en {elapsed // 60}m{elapsed % 60:02d}s")
|
||||
print(f"{'='*55}")
|
||||
for key, fname in OUTPUT_FILES.items():
|
||||
path = os.path.join(OUTPUT_DIR, fname)
|
||||
size_kb = os.path.getsize(path) // 1024
|
||||
print(f" {fname:<35s} {counts[key]:>7,} lignes ({size_kb:>6,} Ko)")
|
||||
print(f" {'TOTAL':<35s} {total:>7,} lignes")
|
||||
if skipped:
|
||||
print(f"\n ⚠ {skipped} enregistrement(s) ignoré(s) :")
|
||||
for a in anomalies[:10]:
|
||||
print(f" {a}")
|
||||
if len(anomalies) > 10:
|
||||
print(f" … et {len(anomalies) - 10} autres")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+54599
File diff suppressed because it is too large
Load Diff
+113
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Merge multiple GEDCOM files into one, renumbering INDI and FAM IDs to avoid collisions."""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
INPUT_DIR = "gedcom_output"
|
||||
OUTPUT_FILE = "lignees.ged"
|
||||
|
||||
def parse_gedcom(path):
|
||||
"""Return (header_note, records) where records is a list of raw record strings."""
|
||||
with open(path, encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
lines = content.splitlines()
|
||||
records = []
|
||||
current = []
|
||||
header_note = ""
|
||||
|
||||
for line in lines:
|
||||
if line.startswith("0 "):
|
||||
if current:
|
||||
tag = current[0]
|
||||
if tag == "0 HEAD":
|
||||
# Extract the NOTE line for source tracking
|
||||
for l in current:
|
||||
if l.startswith("1 NOTE "):
|
||||
header_note = l[7:]
|
||||
elif tag != "0 TRLR":
|
||||
records.append("\n".join(current))
|
||||
current = [line]
|
||||
else:
|
||||
current.append(line)
|
||||
|
||||
if current and current[0] not in ("0 TRLR", "0 HEAD"):
|
||||
records.append("\n".join(current))
|
||||
|
||||
return header_note, records
|
||||
|
||||
|
||||
def renumber_records(records, indi_offset, fam_offset):
|
||||
"""Replace @Innnnn@ and @Fnnnnn@ references with offset-adjusted IDs."""
|
||||
def replace_id(m):
|
||||
kind = m.group(1)
|
||||
num = int(m.group(2))
|
||||
if kind == "I":
|
||||
return f"@I{num + indi_offset:04d}@"
|
||||
else:
|
||||
return f"@F{num + fam_offset:04d}@"
|
||||
|
||||
result = []
|
||||
for record in records:
|
||||
renumbered = re.sub(r"@([IF])(\d+)@", replace_id, record)
|
||||
result.append(renumbered)
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
ged_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.ged")))
|
||||
if not ged_files:
|
||||
print(f"No .ged files found in {INPUT_DIR}/", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Found {len(ged_files)} GEDCOM files to merge.")
|
||||
|
||||
all_records = []
|
||||
indi_offset = 0
|
||||
fam_offset = 0
|
||||
sources = []
|
||||
|
||||
for path in ged_files:
|
||||
basename = os.path.basename(path)
|
||||
note, records = parse_gedcom(path)
|
||||
sources.append(note or basename)
|
||||
|
||||
indi_count = sum(1 for r in records if r.startswith("0 @I"))
|
||||
fam_count = sum(1 for r in records if r.startswith("0 @F"))
|
||||
|
||||
renumbered = renumber_records(records, indi_offset, fam_offset)
|
||||
all_records.extend(renumbered)
|
||||
|
||||
print(f" {basename}: {indi_count} INDI, {fam_count} FAM (offset I+{indi_offset}, F+{fam_offset})")
|
||||
indi_offset += indi_count
|
||||
fam_offset += fam_count
|
||||
|
||||
total_indi = sum(1 for r in all_records if r.startswith("0 @I"))
|
||||
total_fam = sum(1 for r in all_records if r.startswith("0 @F"))
|
||||
print(f"\nTotal: {total_indi} INDI, {total_fam} FAM → {OUTPUT_FILE}")
|
||||
|
||||
with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
|
||||
out.write("0 HEAD\n")
|
||||
out.write("1 SOUR BaseCGL\n")
|
||||
out.write("2 NAME CGL Bases généalogiques du Languedoc – basesgen.sql\n")
|
||||
out.write("1 GEDC\n")
|
||||
out.write("2 VERS 5.5.1\n")
|
||||
out.write("2 FORM LINEAGE-LINKED\n")
|
||||
out.write("1 CHAR UTF-8\n")
|
||||
out.write("1 NOTE Lignées CGL – fusion de 16 exports GEDCOM\n")
|
||||
for src in sources:
|
||||
out.write(f"2 CONT {src}\n")
|
||||
out.write("\n")
|
||||
for record in all_records:
|
||||
out.write(record)
|
||||
out.write("\n\n")
|
||||
out.write("0 TRLR\n")
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user