[In-Depth Guide] The Complete CTGAN + SDV Pipeline for High-Fidelity Synthetic Data


# Convert the metadata object to a plain dictionary, which is the form
# passed to both reports below.
metadata_dict = metadata.to_dict()

# Diagnostic report comparing the real table with the synthetic one.
diagnostic = DiagnosticReport()
diagnostic.generate(
    real_data=real,
    synthetic_data=synthetic_sdv,
    metadata=metadata_dict,
    verbose=True,
)
print("Diagnostic score:", diagnostic.get_score())

# Quality report over the same pair of tables.
quality = QualityReport()
quality.generate(
    real_data=real,
    synthetic_data=synthetic_sdv,
    metadata=metadata_dict,
    verbose=True,
)
print("Quality score:", quality.get_score())

def show_report_details(report, title):
    """Print a header and display the per-property detail tables of a report.

    Args:
        report: A generated report object exposing ``get_properties()`` and
            ``get_details(property_name=...)``.
        title: Label used in the printed section header.
    """
    print(f"\n===== {title} details =====")
    props = report.get_properties()

    # SDMetrics returns the properties as a DataFrame with a "Property"
    # column; iterating the frame directly would yield its column labels
    # rather than the property names. Any other iterable is used as-is.
    if hasattr(props, "columns") and "Property" in props.columns:
        property_names = props["Property"].tolist()
    else:
        property_names = list(props)

    for p in property_names:
        print(f"\n— {p} —")
        details = report.get_details(property_name=p)
        try:
            # Show only the first rows when the details object supports it.
            display(details.head(10))
        except Exception:
            # Best-effort fallback: display the details object unchanged.
            display(details)

# Display the per-property details of both generated reports.
show_report_details(diagnostic, "DiagnosticReport")
show_report_details(quality, "QualityReport")

# Hold out 25% of the real rows as a test set, stratified on the target
# column, with a fixed seed so the split is reproducible.
train_real, test_real = train_test_split(
    real,
    test_size=0.25,
    random_state=42,
    stratify=real[target_col],
)

def make_pipeline(cat_cols, num_cols):
    """Build an unfitted preprocessing + logistic-regression pipeline.

    Args:
        cat_cols: Columns to one-hot encode; the encoder is configured with
            ``handle_unknown="ignore"``.
        num_cols: Columns passed through without transformation.

    Returns:
        A ``Pipeline`` with two steps: ``"pre"`` (the ``ColumnTransformer``)
        and ``"clf"`` (``LogisticRegression`` with ``max_iter=200``).
    """
    pre = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        # Columns listed in neither group are discarded.
        remainder="drop",
    )
    clf = LogisticRegression(max_iter=200)
    return Pipeline([("pre", pre), ("clf", clf)])

# Train on synthetic data, evaluate on the held-out real test rows.
pipe_syn = make_pipeline(categorical_cols, numerical_cols)
pipe_syn.fit(synthetic_sdv.drop(columns=[target_col]), synthetic_sdv[target_col])

# NOTE(review): column 1 of predict_proba is taken as the positive class;
# this assumes it corresponds to the label containing ">" - confirm against
# pipe_syn.classes_.
proba_syn = pipe_syn.predict_proba(test_real.drop(columns=[target_col]))[:, 1]
# Binary ground truth: 1 where the target label contains ">", else 0.
y_true = (test_real[target_col].astype(str).str.contains(">")).astype(int)
auc_syn = roc_auc_score(y_true, proba_syn)
print("Synthetic-train -> Real-test AUC:", auc_syn)

# Baseline: same pipeline trained on the real training rows, scored on the
# same real test rows and the same y_true.
pipe_real = make_pipeline(categorical_cols, numerical_cols)
pipe_real.fit(train_real.drop(columns=[target_col]), train_real[target_col])

proba_real = pipe_real.predict_proba(test_real.drop(columns=[target_col]))[:, 1]
auc_real = roc_auc_score(y_true, proba_real)
print("Real-train -> Real-test AUC:", auc_real)

# Persist the fitted synthesizer to disk.
model_path = "ctgan_sdv_synth.pkl"
synth.save(model_path)
print("Saved synthesizer to:", model_path)

# Reload it from the file that was just written.
# NOTE(review): presumably a pickle-based format given the .pkl extension -
# only load synthesizer files from trusted sources.
from sdv.utils import load_synthesizer
synth_loaded = load_synthesizer(model_path)

# Draw 1000 rows from the reloaded synthesizer and preview them.
synthetic_loaded = synth_loaded.sample(1000)
print("Loaded synthesizer sample:")
display(synthetic_loaded.head())



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *

Pin It on Pinterest