Spaces:
Runtime error
Runtime error
| # train_model.py | |
| # Script that trains the AI model; it is reusable, so do not delete it. | |
| import pandas as pd | |
| import json | |
| import re | |
| import sys | |
| import transformers | |
| import torch | |
| from transformers import AutoTokenizer | |
# --- 1. Data loading and preprocessing ---
print("--- [Phase 1] ๋ฐ์ดํฐ ๋ก๋ฉ ๋ฐ ์ ์ฒ๋ฆฌ ์์ ---")

# Directory holding the label files; the trailing slash is relied on when
# the file names are appended below.
file_path = './data/'

# Read the raw training and validation label JSON (same as before).
with open(f'{file_path}training-label.json', encoding='utf-8') as file:
    training_data_raw = json.loads(file.read())

with open(f'{file_path}validation-label.json', encoding='utf-8') as file:
    validation_data_raw = json.loads(file.read())
# DataFrame construction (wrapped in a function to keep the script tidy).
def create_dataframe(data_raw):
    """Flatten raw dialogue records into a ``text`` / ``emotion`` DataFrame.

    Args:
        data_raw: Iterable of dialogue records. For each record the label is
            read from ``record['profile']['emotion']['type']`` and the
            utterances from the values of ``record['talk']['content']``.

    Returns:
        A DataFrame with exactly the columns ``text`` (the utterances of one
        dialogue joined by single spaces) and ``emotion``. Both columns exist
        even when no record qualifies, so callers can index ``df['text']``
        on an empty result without a KeyError.
    """
    extracted_data = []
    for dialogue in data_raw:
        try:
            emotion_type = dialogue['profile']['emotion']['type']
            dialogue_content = dialogue['talk']['content']
        except KeyError:
            # Best-effort parsing: records missing an expected key are skipped.
            continue
        full_text = " ".join(dialogue_content.values())
        # Drop records with no text or no label.
        if full_text and emotion_type:
            extracted_data.append({'text': full_text, 'emotion': emotion_type})
    # Naming the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(extracted_data, columns=['text', 'emotion'])
# Build the training and validation frames (in that order) from the raw records.
df_train, df_val = (
    create_dataframe(raw_records)
    for raw_records in (training_data_raw, validation_data_raw)
)
# Text cleaning
def clean_text(text):
    """Remove every character that is not Hangul, an ASCII letter/digit, or a space.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters (punctuation, symbols, newlines,
        ...) deleted; kept characters stay in their original order.
    """
    # The Hangul syllable range (U+AC00 to U+D7A3) is written with \u escapes
    # so the pattern cannot be corrupted if this file is re-encoded; a
    # mis-decoded literal range would delete all Korean text.
    return re.sub(r'[^\uac00-\ud7a3a-zA-Z0-9 ]', '', text)
# Apply the cleaning step to both splits.
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
print("โ ๋ฐ์ดํฐ ๋ก๋ฉ ๋ฐ ์ ์ฒ๋ฆฌ ์๋ฃ!")

# --- 2. AI modeling preparation ---
print("\n--- [Phase 2] AI ๋ชจ๋ธ๋ง ์ค๋น ์์ ---")

# Load the tokenizer that belongs to the pretrained model.
MODEL_NAME = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize the text: each split becomes padded PyTorch tensors, truncated to
# at most 128 tokens per example.
train_tokenized = tokenizer(list(df_train['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)
val_tokenized = tokenizer(list(df_val['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)

# Label encoding: ids are assigned from the sorted training labels.
unique_labels = sorted(df_train['emotion'].unique())
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
df_train['label'] = df_train['emotion'].map(label_to_id)

# A validation label that never occurs in the training data would map to NaN,
# turning the whole label column into floats and breaking the evaluation only
# after training has finished. Fail here with a clear message instead.
unseen_mask = ~df_val['emotion'].isin(unique_labels)
if unseen_mask.any():
    unseen_labels = df_val.loc[unseen_mask, 'emotion'].unique().tolist()
    raise ValueError(
        "validation data contains emotion labels that are absent from the "
        f"training data: {unseen_labels}"
    )
df_val['label'] = df_val['emotion'].map(label_to_id)

print("โ ํ ํฐํ ๋ฐ ๋ผ๋ฒจ ์ธ์ฝ๋ฉ ์๋ฃ!")
print("์ด์ ๋ชจ๋ธ ํ๋ จ์ ์ํ ๋ชจ๋ ์ค๋น๊ฐ ๋๋ฌ์ต๋๋ค.")
# NOTE(review): the next line looks like a pasted instruction rather than
# documentation: "Replace the existing [Phase 3] code with the content below."
# -----------------------------------------------------------
# --- [Phase 3] Model training and evaluation (latest-feature version) ---
# -----------------------------------------------------------
# torch is imported a second time here; it is already imported at the top of the file.
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Announce the start of the training / evaluation phase.
print("\n--- [Phase 3] ๋ชจ๋ธ ํ์ต ๋ฐ ํ๊ฐ ์์ ---")
# 1. PyTorch Dataset class (same as before)
class EmotionDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name to a tensor that is indexed by
            example position (``.items()`` is called on it).
        labels: Sequence of labels; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every encoding field at
        ``idx`` plus the label under the key ``'labels'``."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap the tokenized splits together with their integer labels.
train_labels = df_train['label'].tolist()
val_labels = df_val['label'].tolist()
train_dataset = EmotionDataset(train_tokenized, train_labels)
val_dataset = EmotionDataset(val_tokenized, val_labels)
print("โ PyTorch ๋ฐ์ดํฐ์ ์์ฑ์ด ์๋ฃ๋์์ต๋๋ค.")

# 2. Load the AI model (same as before): a sequence-classification head with
# one output per training label and the id <-> label maps stored in its config.
classifier_options = {
    'num_labels': len(unique_labels),
    'id2label': id_to_label,
    'label2id': label_to_id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_options)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print(f"โ ๋ชจ๋ธ ๋ก๋ฉ ์๋ฃ! ๋ชจ๋ธ์ {device}์์ ์คํ๋ฉ๋๋ค.")
# 3. Function that scores model predictions (fixed version)
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for predictions.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is used as the predicted class id).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    true_ids = pred.label_ids
    # Pick the highest-scoring class for every example.
    predicted_ids = pred.predictions.argmax(-1)
    # Index 0/1/2 of the result are precision/recall/F1; the 4th entry is unused.
    weighted_scores = precision_recall_fscore_support(
        true_ids, predicted_ids, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1': weighted_scores[2],
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
    }
# 4. Training configuration (TrainingArguments); every optional
# evaluation / saving option is left at its library default.
training_args = TrainingArguments(
    output_dir='./results',          # where the model is written (required)
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # training batch size per device
)

# 5. Trainer. No eval_dataset is passed, so nothing is evaluated while
# training runs. compute_metrics is still registered so that the explicit
# evaluate() call below reports accuracy / precision / recall / F1 rather
# than the loss alone.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

# 6. Start training.
print("\n๐ฅ AI ๋ชจ๋ธ ํ๋ จ์ ์์ํฉ๋๋ค...")
trainer.train()
print("\n๐ ๋ชจ๋ธ ํ๋ จ ์๋ฃ!")

# Persist the final weights and the tokenizer explicitly. The closing message
# sends the user to the results folder, and relying only on the Trainer's
# periodic checkpoints does not guarantee that the last training step is
# saved (NOTE(review): depends on the save_strategy / save_steps defaults).
# Saving before the evaluation also keeps the model if evaluation fails.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

# 7. The final evaluation runs separately, after training has finished.
print("\n--- ์ต์ข ๋ชจ๋ธ ์ฑ๋ฅ ํ๊ฐ ---")
# The validation set is handed directly to evaluate() because the Trainer
# was created without one.
final_evaluation = trainer.evaluate(eval_dataset=val_dataset)
print(final_evaluation)
print("\n๋ชจ๋ ๊ณผ์ ์ด ์ฑ๊ณต์ ์ผ๋ก ๋๋ฌ์ต๋๋ค! results ํด๋์์ ํ๋ จ๋ ๋ชจ๋ธ์ ํ์ธํ์ธ์.")