binary-dockerfile-model / scripts /11.1_evaluate_binary.py

Add scripts

e9b8340 verified 9 months ago

2.46 kB

	# 11.1_evaluate_binary.py (v3): evaluate the binary classifier on the test split

	from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
	from datasets import load_from_disk
	from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from pathlib import Path
	import json

	# === Paths
	# Directory of the trained model and directory of the dataset saved on disk.
	MODEL_DIR = Path("models/binary/final")
	DATASET_DIR = Path("data/processed/dataset_binary")

	# Evaluation artifacts are written into the model directory itself.
	OUT_DIR = MODEL_DIR
	REPORT_CSV = OUT_DIR.joinpath("classification_report.csv")
	REPORT_JSON = OUT_DIR.joinpath("metrics.json")
	CONF_MATRIX_PNG = OUT_DIR.joinpath("confusion_matrix.png")

	# === Load the model
	print("📂 Wczytywanie modelu...")
	model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

	# === Load the tokenizer stored with the model, or a fallback one
	# The model directory counts as containing a tokenizer when at least one
	# entry in it has a name starting with "tokenizer".
	tokenizer_files = list(MODEL_DIR.glob("tokenizer*"))
	if tokenizer_files:
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
	else:
	    print("⚠️ Brak tokenizera w modelu — pobieram z microsoft/codebert-base")
	    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
	    # Persist the fallback so later runs find a tokenizer in MODEL_DIR.
	    # NOTE(review): saving only in the fallback branch is inferred from the
	    # flattened source; confirm against the original indentation.
	    tokenizer.save_pretrained(MODEL_DIR)

	# === Load the data
	ds = load_from_disk(str(DATASET_DIR))
	# NOTE(review): the Trainer receives only the model; the tokenizer loaded
	# earlier in this script is not passed to it. Presumably the saved dataset
	# is already tokenized; confirm.
	trainer = Trainer(model=model)

	# === Prediction
	print("🔍 Predykcja na zbiorze testowym...")
	test_split = ds["test"]
	predictions = trainer.predict(test_split)
	y_true = predictions.label_ids
	# Predicted class id = index of the largest score along axis 1.
	logits = predictions.predictions
	y_pred = np.argmax(logits, axis=1)

	# === Classification report
	print("\n📊 Raport klasyfikacji:")
	# Pin the class ids explicitly. Without `labels`, scikit-learn derives the
	# classes from the data, and the report fails when only one class occurs in
	# y_true/y_pred because two target names are supplied.
	# Assumes class id 0 is "good" and 1 is "bad", the same mapping the
	# positional target_names implied before; confirm against training labels.
	report_kwargs = dict(labels=[0, 1], target_names=["good", "bad"], zero_division=0)
	report_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
	report_text = classification_report(y_true, y_pred, **report_kwargs)
	print(report_text)

	# Save as CSV + JSON
	df_report = pd.DataFrame(report_dict).transpose()
	df_report.to_csv(REPORT_CSV)
	# Explicit encoding keeps the output independent of the platform default.
	with open(REPORT_JSON, "w", encoding="utf-8") as f:
	    json.dump(report_dict, f, indent=2)

	print(f"💾 Zapisano raport CSV: {REPORT_CSV}")
	print(f"💾 Zapisano metryki JSON: {REPORT_JSON}")

	# === Confusion matrix + plot
	labels = ["good", "bad"]
	# Fix the class ids so the matrix is always 2x2 and matches `labels`, even
	# when one class is missing from y_true/y_pred.
	# Assumes class id 0 is "good" and 1 is "bad"; confirm against training labels.
	conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
	disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=labels)

	# Create the axes first and hand them to disp.plot(). Called without `ax`,
	# plot() opens its own figure, so the requested figsize would be ignored and
	# the figure created here would stay open.
	fig, ax = plt.subplots(figsize=(5, 4))
	disp.plot(ax=ax, cmap="Purples", values_format="d")
	ax.set_title("🧱 Confusion Matrix – Binary Classifier")
	ax.grid(False)
	fig.tight_layout()
	fig.savefig(CONF_MATRIX_PNG)
	plt.close(fig)

	print(f"🖼️ Zapisano confusion matrix jako PNG: {CONF_MATRIX_PNG}")

LeeSek
/

binary-dockerfile-model

binary-classification

codebert

Model card Files Files and versions

xet

Community

binary-dockerfile-model / scripts /11.1_evaluate_binary.py

LeeSek

Add scripts

e9b8340 verified 9 months ago

raw

history blame contribute delete

2.46 kB

	# 11.1_evaluate_binary.py (v3): evaluate the binary classifier on the test split

	from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
	from datasets import load_from_disk
	from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from pathlib import Path
	import json

	# === Paths
	# Directory of the trained model and directory of the dataset saved on disk.
	MODEL_DIR = Path("models/binary/final")
	DATASET_DIR = Path("data/processed/dataset_binary")

	# Evaluation artifacts are written into the model directory itself.
	OUT_DIR = MODEL_DIR
	REPORT_CSV = OUT_DIR.joinpath("classification_report.csv")
	REPORT_JSON = OUT_DIR.joinpath("metrics.json")
	CONF_MATRIX_PNG = OUT_DIR.joinpath("confusion_matrix.png")

	# === Load the model
	print("📂 Wczytywanie modelu...")
	model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

	# === Load the tokenizer stored with the model, or a fallback one
	# The model directory counts as containing a tokenizer when at least one
	# entry in it has a name starting with "tokenizer".
	tokenizer_files = list(MODEL_DIR.glob("tokenizer*"))
	if tokenizer_files:
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
	else:
	    print("⚠️ Brak tokenizera w modelu — pobieram z microsoft/codebert-base")
	    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
	    # Persist the fallback so later runs find a tokenizer in MODEL_DIR.
	    # NOTE(review): saving only in the fallback branch is inferred from the
	    # flattened source; confirm against the original indentation.
	    tokenizer.save_pretrained(MODEL_DIR)

	# === Load the data
	ds = load_from_disk(str(DATASET_DIR))
	# NOTE(review): the Trainer receives only the model; the tokenizer loaded
	# earlier in this script is not passed to it. Presumably the saved dataset
	# is already tokenized; confirm.
	trainer = Trainer(model=model)

	# === Prediction
	print("🔍 Predykcja na zbiorze testowym...")
	test_split = ds["test"]
	predictions = trainer.predict(test_split)
	y_true = predictions.label_ids
	# Predicted class id = index of the largest score along axis 1.
	logits = predictions.predictions
	y_pred = np.argmax(logits, axis=1)

	# === Classification report
	print("\n📊 Raport klasyfikacji:")
	# Pin the class ids explicitly. Without `labels`, scikit-learn derives the
	# classes from the data, and the report fails when only one class occurs in
	# y_true/y_pred because two target names are supplied.
	# Assumes class id 0 is "good" and 1 is "bad", the same mapping the
	# positional target_names implied before; confirm against training labels.
	report_kwargs = dict(labels=[0, 1], target_names=["good", "bad"], zero_division=0)
	report_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
	report_text = classification_report(y_true, y_pred, **report_kwargs)
	print(report_text)

	# Save as CSV + JSON
	df_report = pd.DataFrame(report_dict).transpose()
	df_report.to_csv(REPORT_CSV)
	# Explicit encoding keeps the output independent of the platform default.
	with open(REPORT_JSON, "w", encoding="utf-8") as f:
	    json.dump(report_dict, f, indent=2)

	print(f"💾 Zapisano raport CSV: {REPORT_CSV}")
	print(f"💾 Zapisano metryki JSON: {REPORT_JSON}")

	# === Confusion matrix + plot
	labels = ["good", "bad"]
	# Fix the class ids so the matrix is always 2x2 and matches `labels`, even
	# when one class is missing from y_true/y_pred.
	# Assumes class id 0 is "good" and 1 is "bad"; confirm against training labels.
	conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
	disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=labels)

	# Create the axes first and hand them to disp.plot(). Called without `ax`,
	# plot() opens its own figure, so the requested figsize would be ignored and
	# the figure created here would stay open.
	fig, ax = plt.subplots(figsize=(5, 4))
	disp.plot(ax=ax, cmap="Purples", values_format="d")
	ax.set_title("🧱 Confusion Matrix – Binary Classifier")
	ax.grid(False)
	fig.tight_layout()
	fig.savefig(CONF_MATRIX_PNG)
	plt.close(fig)

	print(f"🖼️ Zapisano confusion matrix jako PNG: {CONF_MATRIX_PNG}")