# train_model.py
# Script that trains the AI model; keep it for reuse, do not delete.
import json
import re

import pandas as pd
import torch
from transformers import AutoTokenizer
# --- 1. Data loading and preprocessing ---
print("--- [Phase 1] Starting data loading and preprocessing ---")

# Data directory
file_path = './data/'

# Load the training/validation data (same as before)
with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
    training_data_raw = json.load(file)
with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
    validation_data_raw = json.load(file)
# DataFrame builder (wrapped in a function to keep the code tidy)
def create_dataframe(data_raw):
    extracted_data = []
    for dialogue in data_raw:
        try:
            emotion_type = dialogue['profile']['emotion']['type']
            dialogue_content = dialogue['talk']['content']
            full_text = " ".join(list(dialogue_content.values()))
            if full_text and emotion_type:
                extracted_data.append({'text': full_text, 'emotion': emotion_type})
        except KeyError:
            continue
    return pd.DataFrame(extracted_data)
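# The parser above assumes each record follows the emotion-dialogue layout
# implied by the key accesses, roughly (an illustrative sketch, not a
# verbatim sample from the data):
# {
#   "profile": {"emotion": {"type": "E18", ...}, ...},
#   "talk": {"content": {"HS01": "...", "SS01": "...", ...}, ...}
# }
# Records missing either key are skipped by the KeyError handler.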
df_train = create_dataframe(training_data_raw)
df_val = create_dataframe(validation_data_raw)
# Text cleaning
def clean_text(text):
    return re.sub(r'[^가-힣a-zA-Z0-9 ]', '', text)

df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
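# Quick illustrative check: clean_text keeps only Hangul syllables (가-힣),
# Latin letters, digits, and spaces; punctuation (and standalone jamo such
# as ㅋㅋ, which fall outside the syllable block) is dropped.
assert clean_text("hello, world!") == "hello world"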
print("โœ… ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ ๋ฐ ์ „์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
# --- 2. Preparing the AI model ---
print("\n--- [Phase 2] Starting AI model preparation ---")

# Load the model and tokenizer
MODEL_NAME = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Tokenize the text
train_tokenized = tokenizer(list(df_train['cleaned_text']), return_tensors="pt",
                            max_length=128, padding=True, truncation=True)
val_tokenized = tokenizer(list(df_val['cleaned_text']), return_tensors="pt",
                          max_length=128, padding=True, truncation=True)
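# Optional sanity check: the tokenizer returns a dict-like BatchEncoding with
# "input_ids" and "attention_mask" tensors of shape
# (num_examples, sequence_length <= 128).
print(f"train input_ids shape: {train_tokenized['input_ids'].shape}")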
# Label encoding
unique_labels = sorted(df_train['emotion'].unique())
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
df_train['label'] = df_train['emotion'].map(label_to_id)
df_val['label'] = df_val['emotion'].map(label_to_id)
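# The mapping is derived from the data, e.g. (illustrative, assuming the
# corpus uses emotion type codes like 'E18'):
# label_to_id == {'E10': 0, 'E18': 1, ...}, with id_to_label as its inverse.
print(f"Number of emotion classes: {len(unique_labels)}")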
print("โœ… ํ† ํฐํ™” ๋ฐ ๋ผ๋ฒจ ์ธ์ฝ”๋”ฉ ์™„๋ฃŒ!")
print("์ด์ œ ๋ชจ๋ธ ํ›ˆ๋ จ์„ ์œ„ํ•œ ๋ชจ๋“  ์ค€๋น„๊ฐ€ ๋๋‚ฌ์Šต๋‹ˆ๋‹ค.")
# -----------------------------------------------------------
# --- [Phase 3] Model training and evaluation (minimal version) ---
# -----------------------------------------------------------
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

print("\n--- [Phase 3] Starting model training and evaluation ---")
# 1. Define the PyTorch Dataset class (same as before)
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = EmotionDataset(train_tokenized, df_train['label'].tolist())
val_dataset = EmotionDataset(val_tokenized, df_val['label'].tolist())
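# Optional sanity check: each item should be a dict of tensors the Trainer
# can collate (input_ids, attention_mask, labels, ...).
sample = train_dataset[0]
print({k: tuple(v.shape) for k, v in sample.items()})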
print("โœ… PyTorch ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
# 2. Load the AI model (same as before)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Note: Trainer moves the model to the available device on its own; this
# explicit .to(device) mainly feeds the status message below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"✅ Model loaded! It will run on {device}.")
# 3. Define the metric function for model evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    # Take the argmax over the logits to get predicted class ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}
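# Minimal self-test (illustrative): compute_metrics only needs an object with
# .predictions (logits) and .label_ids, so a SimpleNamespace stands in for the
# EvalPrediction that the Trainer would normally pass.
import numpy as np
from types import SimpleNamespace
_fake = SimpleNamespace(
    predictions=np.array([[0.9, 0.1], [0.2, 0.8]]),  # 2 examples, 2 classes
    label_ids=np.array([0, 1]),
)
assert compute_metrics(_fake)['accuracy'] == 1.0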
# 4. Define the training arguments (all optional extras removed)
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are saved (required)
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # training batch size
    # All evaluation/saving options are deliberately omitted here.
)
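# If per-epoch evaluation/checkpointing is wanted later, TrainingArguments
# accepts options along these lines (a hedged sketch; exact argument names
# vary across transformers versions, e.g. evaluation_strategy vs eval_strategy):
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     evaluation_strategy='epoch',  # evaluate on eval_dataset every epoch
#     save_strategy='epoch',        # save a checkpoint every epoch
#     logging_steps=100,            # log training loss every 100 steps
# )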
# 5. Define the Trainer (in-training evaluation disabled)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # No evaluation during training, so eval_dataset and compute_metrics
    # are intentionally left out; evaluation runs separately below.
)
# 6. Start model training!
print("\n🔥 Starting AI model training...")
trainer.train()
print("\n🎉 Model training complete!")
# 7. Run the final evaluation separately, after training has finished
print("\n--- Final model evaluation ---")
# Pass the held-out validation dataset directly to evaluate()
final_evaluation = trainer.evaluate(eval_dataset=val_dataset)
print(final_evaluation)
print("\nAll steps finished successfully! Check the ./results folder for the trained model.")