#!/usr/bin/env python3 """ Export de la table `mariage` depuis basesgen.sql vers 4 fichiers CSV classés par type d'acte : - mariages_religieux.csv (BMS) - contrats_de_mariages.csv (CM + TABLE CM) - tables_de_mariages.csv (TABLE MARIAGES) - etat_civil.csv (ETAT CIVIL) """ import csv import os import sys import time SQL_FILE = "/home/yannick/Téléchargements/basesgen.sql" OUTPUT_DIR = "/home/yannick/BasesCGL/csv_export" # Colonnes telles qu'elles existent dans le SQL (26 colonnes) SQL_COLUMNS = [ "CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE", "NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX", "DATE_NAISSANCE_EPOUX", "VILLE_EPOUX", "NOM_PERE_EPOUX", "NOM_MERE_EPOUX", "VEUF_EPOUX", "NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE", "DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE", "NOM_PERE_EPOUSE", "NOM_MERE_EPOUSE", "VEUVE_EPOUSE", "OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE", ] # Colonnes en sortie CSV (30 colonnes) : 4 colonnes dérivées insérées après chaque paire de parents CSV_COLUMNS = [ "CLE_MARIAGE", "JOUR_MARIAGE", "MOIS_MARIAGE", "ANNEE_MARIAGE", "NOM_EPOUX", "PRENOM_EPOUX", "AGE_EPOUX", "PROF_EPOUX", "DATE_NAISSANCE_EPOUX", "VILLE_EPOUX", "NOM_PERE_EPOUX", "PERE_EPOUX_DECEDE", "NOM_MERE_EPOUX", "MERE_EPOUX_DECEDEE", "VEUF_EPOUX", "NOM_EPOUSE", "PRENOM_EPOUSE", "AGE_EPOUSE", "PROF_EPOUSE", "DATE_NAISSANCE_EPOUSE", "VILLE_EPOUSE", "NOM_PERE_EPOUSE", "PERE_EPOUSE_DECEDE", "NOM_MERE_EPOUSE", "MERE_EPOUSE_DECEDEE", "VEUVE_EPOUSE", "OBSERVATION_MARIAGE", "LIEU_ACTE", "TYPE_ACTE_BRUT", "CODE_INSEE", ] TYPE_ACTE_IDX = 24 # index de TYPE_ACTE dans la ligne SQL parsée (0-based) # Indices SQL des noms de parents (pour détecter le '+') IDX_PERE_EPOUX = 10 IDX_MERE_EPOUX = 11 IDX_PERE_EPOUSE = 19 IDX_MERE_EPOUSE = 20 # Mapping catégorie → nom de fichier OUTPUT_FILES = { "BMS": "mariages_religieux.csv", "CM": "contrats_de_mariages.csv", "TABLES": "tables_de_mariages.csv", "ETAT_CIVIL": "etat_civil.csv", } def classify(type_acte: str) -> str | None: # Normaliser les espaces multiples pour la classification import re as _re ta = _re.sub(r'\s+', ' ', type_acte.strip()).upper() if ta.startswith("BMS"): return "BMS" if ta.startswith("ETAT CIVIL") or ta.startswith("ETAT CILVIL"): return "ETAT_CIVIL" if ta.startswith("TABLE CM") or ta.startswith("TABLE C"): return "CM" # fusionné avec Contrats de Mariages if ta.startswith("TABLE"): return "TABLES" # TABLE MARIAGES, TABLE PUBLICATIONS DE MARIAGES, etc. if ta.startswith("CM"): return "CM" return None def parse_mysql_value(src: str, pos: int) -> tuple[str, int]: """Parse a single MySQL value starting at pos; return (value_str, next_pos).""" # skip whitespace while pos < len(src) and src[pos] in (' ', '\t'): pos += 1 if pos >= len(src): return ("", pos) if src[pos] == "'": # quoted string pos += 1 buf = [] while pos < len(src): ch = src[pos] if ch == '\\' and pos + 1 < len(src): nxt = src[pos + 1] if nxt == "'": buf.append("'") elif nxt == '\\': buf.append('\\') elif nxt == 'n': buf.append('\n') elif nxt == 'r': buf.append('\r') elif nxt == 't': buf.append('\t') else: buf.append(nxt) pos += 2 elif ch == "'": pos += 1 break else: buf.append(ch) pos += 1 return (''.join(buf), pos) if src[pos:pos + 4] == 'NULL': return ('', pos + 4) # integer (possibly negative) j = pos if j < len(src) and src[j] == '-': j += 1 while j < len(src) and src[j].isdigit(): j += 1 return (src[pos:j], j) def parse_row(line: str) -> list[str] | None: """Parse one INSERT tuple line into a list of string values, or None if not a data row.""" line = line.strip() if not line.startswith('('): return None # strip trailing ), or ); if line.endswith(');'): inner = line[1:-2] elif line.endswith('),'): inner = line[1:-2] elif line.endswith(')'): inner = line[1:-1] else: return None values = [] pos = 0 while pos < len(inner): # skip whitespace and commas between values while pos < len(inner) and inner[pos] in (' ', '\t'): pos += 1 if pos >= len(inner): break if inner[pos] == ',': pos += 1 continue val, pos = parse_mysql_value(inner, pos) values.append(val) return values if len(values) == len(SQL_COLUMNS) else None def deceased_flag(name: str) -> str: """Renvoie '1' si le nom se termine par '+', '' sinon.""" return "1" if name.strip().endswith("+") else "" def enrich_row(sql_row: list[str]) -> list[str]: """Insère les 4 colonnes dérivées dans la liste SQL (26 → 30 colonnes).""" r = sql_row return [ # Époux (indices SQL 0-9 inchangés) r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8], r[9], # Parents époux + flags décès r[IDX_PERE_EPOUX], deceased_flag(r[IDX_PERE_EPOUX]), r[IDX_MERE_EPOUX], deceased_flag(r[IDX_MERE_EPOUX]), r[12], # VEUF_EPOUX # Épouse (SQL 13-18) r[13], r[14], r[15], r[16], r[17], r[18], # Parents épouse + flags décès r[IDX_PERE_EPOUSE], deceased_flag(r[IDX_PERE_EPOUSE]), r[IDX_MERE_EPOUSE], deceased_flag(r[IDX_MERE_EPOUSE]), r[21], # VEUVE_EPOUSE # Reste (SQL 22-25) r[22], r[23], r[24], r[25], ] def main(): os.makedirs(OUTPUT_DIR, exist_ok=True) file_size = os.path.getsize(SQL_FILE) # Ouvrir les 4 CSV en sortie (UTF-8 avec BOM pour compatibilité Excel) handles = {} writers = {} for key, fname in OUTPUT_FILES.items(): path = os.path.join(OUTPUT_DIR, fname) fh = open(path, 'w', encoding='utf-8-sig', newline='') writer = csv.writer(fh, quoting=csv.QUOTE_ALL) writer.writerow(CSV_COLUMNS) handles[key] = fh writers[key] = writer counts = {k: 0 for k in OUTPUT_FILES} skipped = 0 anomalies = [] start = time.time() bytes_read = 0 in_mariage_insert = False last_pct = -1 print(f"Lecture de {SQL_FILE} ({file_size / 1e9:.2f} Go)…") with open(SQL_FILE, encoding='utf-8', errors='replace') as f: for line in f: bytes_read += len(line.encode('utf-8', errors='replace')) # Affichage de la progression tous les 5 % pct = int(bytes_read * 100 / file_size) if pct != last_pct and pct % 5 == 0: elapsed = time.time() - start total_est = int(elapsed / max(bytes_read, 1) * file_size) remaining = max(0, total_est - int(elapsed)) print(f" {pct:3d}% ({bytes_read / 1e9:.2f} Go) " f"~{remaining // 60}m{remaining % 60:02d}s restant", flush=True) last_pct = pct # Détection des blocs INSERT de la table mariage if 'INSERT INTO `mariage`' in line: in_mariage_insert = True continue if not in_mariage_insert: continue # Fin du bloc INSERT if line.strip() == '' or line.strip().startswith('--') or line.strip().startswith('/*!'): if not line.strip().startswith('('): in_mariage_insert = False continue row = parse_row(line) if row is None: if line.strip().startswith('('): skipped += 1 anomalies.append(line.strip()[:120]) continue type_acte = row[TYPE_ACTE_IDX] cat = classify(type_acte) if cat is None: skipped += 1 anomalies.append(f"TYPE_ACTE inconnu: {repr(type_acte)}") continue writers[cat].writerow(enrich_row(row)) counts[cat] += 1 # Fin de bloc (dernière ligne se termine par ';') if line.rstrip().endswith(';'): in_mariage_insert = False for fh in handles.values(): fh.close() elapsed = int(time.time() - start) total = sum(counts.values()) print(f"\n{'='*55}") print(f"Export terminé en {elapsed // 60}m{elapsed % 60:02d}s") print(f"{'='*55}") for key, fname in OUTPUT_FILES.items(): path = os.path.join(OUTPUT_DIR, fname) size_kb = os.path.getsize(path) // 1024 print(f" {fname:<35s} {counts[key]:>7,} lignes ({size_kb:>6,} Ko)") print(f" {'TOTAL':<35s} {total:>7,} lignes") if skipped: print(f"\n ⚠ {skipped} enregistrement(s) ignoré(s) :") for a in anomalies[:10]: print(f" {a}") if len(anomalies) > 10: print(f" … et {len(anomalies) - 10} autres") if __name__ == "__main__": main()