# train_model.py
# Script for training the AI emotion-classification model. Reusable; do not delete.

import json
import re

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# --- 1. Data loading and preprocessing ---

print("--- [Phase 1] Starting data loading and preprocessing ---")
# Data directory
file_path = './data/'

# Load the raw training/validation label files
with open(file_path + 'training-label.json', 'r', encoding='utf-8') as file:
    training_data_raw = json.load(file)
with open(file_path + 'validation-label.json', 'r', encoding='utf-8') as file:
    validation_data_raw = json.load(file)

# Build a DataFrame of (text, emotion) pairs; wrapped in a function to keep the code tidy
def create_dataframe(data_raw):
    extracted_data = []
    for dialogue in data_raw:
        try:
            emotion_type = dialogue['profile']['emotion']['type']
            dialogue_content = dialogue['talk']['content']
            full_text = " ".join(dialogue_content.values())
            if full_text and emotion_type:
                extracted_data.append({'text': full_text, 'emotion': emotion_type})
        except KeyError:
            continue
    return pd.DataFrame(extracted_data)

df_train = create_dataframe(training_data_raw)
df_val = create_dataframe(validation_data_raw)

# ν…μŠ€νŠΈ μ •μ œ
def clean_text(text):
    return re.sub(r'[^κ°€-힣a-zA-Z0-9 ]', '', text)
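# e.g. clean_text("정말 μ’‹μ•„μš”!!") -> "정말 μ’‹μ•„μš”" (punctuation stripped, Hangul kept)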

df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_val['cleaned_text'] = df_val['text'].apply(clean_text)
print("βœ… 데이터 λ‘œλ”© 및 μ „μ²˜λ¦¬ μ™„λ£Œ!")


# --- 2. AI modeling preparation ---
print("\n--- [Phase 2] Starting AI modeling preparation ---")
# Load the tokenizer for the pretrained Korean model
MODEL_NAME = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ν…μŠ€νŠΈ 토큰화
train_tokenized = tokenizer(list(df_train['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)
val_tokenized = tokenizer(list(df_val['cleaned_text']), return_tensors="pt", max_length=128, padding=True, truncation=True)
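
# Optional shape check: both results are padded/truncated to at most 128
# tokens, so input_ids should be (num_examples, seq_len<=128).
print("Train input_ids shape:", tuple(train_tokenized['input_ids'].shape))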

# Label encoding: map each emotion string to an integer id
unique_labels = sorted(df_train['emotion'].unique())
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}  # idx avoids shadowing builtin id
id_to_label = {idx: label for label, idx in label_to_id.items()}
df_train['label'] = df_train['emotion'].map(label_to_id)
df_val['label'] = df_val['emotion'].map(label_to_id)
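
# Optional (assumed path './results/label_map.json'): persist the label mapping
# so a separate inference script can decode predicted ids later.
import os  # local import for this optional snippet
os.makedirs('./results', exist_ok=True)
with open('./results/label_map.json', 'w', encoding='utf-8') as f:
    json.dump(id_to_label, f, ensure_ascii=False, indent=2)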
print("βœ… 토큰화 및 라벨 인코딩 μ™„λ£Œ!")
print("이제 λͺ¨λΈ ν›ˆλ ¨μ„ μœ„ν•œ λͺ¨λ“  μ€€λΉ„κ°€ λλ‚¬μŠ΅λ‹ˆλ‹€.")


# -----------------------------------------------------------
# --- [Phase 3] Model training and evaluation (minimal version) ---
# -----------------------------------------------------------

print("\n--- [Phase 3] Starting model training and evaluation ---")

# 1. PyTorch Dataset wrapping the tokenized encodings and integer labels
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = EmotionDataset(train_tokenized, df_train['label'].tolist())
val_dataset = EmotionDataset(val_tokenized, df_val['label'].tolist())
print("βœ… PyTorch 데이터셋 생성이 μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€.")

# 2. AI λͺ¨λΈ 뢈러였기 (이전과 동일)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"βœ… λͺ¨λΈ λ‘œλ”© μ™„λ£Œ! λͺ¨λΈμ€ {device}μ—μ„œ μ‹€ν–‰λ©λ‹ˆλ‹€.")


# 3. λͺ¨λΈ μ„±λŠ₯ 평가λ₯Ό μœ„ν•œ ν•¨μˆ˜ μ •μ˜ (μˆ˜μ • μ™„λ£Œ)
def compute_metrics(pred):
    labels = pred.label_ids
    # λ°”λ‘œ 이 뢀뢄이 μˆ˜μ •λ˜μ—ˆμŠ΅λ‹ˆλ‹€.
    preds = pred.predictions.argmax(-1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# 4. ν›ˆλ ¨μ„ μœ„ν•œ 상세 μ„€μ •(Arguments) μ •μ˜ (λͺ¨λ“  λΆ€κ°€ μ˜΅μ…˜ 제거)
training_args = TrainingArguments(
    output_dir='./results',          # λͺ¨λΈμ΄ μ €μž₯될 μœ„μΉ˜ (ν•„μˆ˜)
    num_train_epochs=3,              # ν›ˆλ ¨ 횟수
    per_device_train_batch_size=16,  # ν›ˆλ ¨ 배치 μ‚¬μ΄μ¦ˆ
    # λ‚˜λ¨Έμ§€ λͺ¨λ“  평가/μ €μž₯ κ΄€λ ¨ μ˜΅μ…˜μ€ λͺ¨λ‘ μ œκ±°ν•©λ‹ˆλ‹€.
)
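# Note (version-dependent): even with this minimal config, transformers'
# default save_strategy="steps" still writes periodic checkpoints into
# output_dir, which is why the final message below points at the results folder.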

# 5. Trainer definition. No eval_dataset is passed, so nothing is evaluated
#    during training; compute_metrics is still wired in so the standalone
#    evaluate() call below can report accuracy/F1 rather than just the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

# 6. λͺ¨λΈ ν›ˆλ ¨ μ‹œμž‘!
print("\nπŸ”₯ AI λͺ¨λΈ ν›ˆλ ¨μ„ μ‹œμž‘ν•©λ‹ˆλ‹€...")
trainer.train()
print("\nπŸŽ‰ λͺ¨λΈ ν›ˆλ ¨ μ™„λ£Œ!")

# 7. Final evaluation runs separately, after training has finished
print("\n--- Final model performance evaluation ---")
# Pass the held-out validation dataset directly to evaluate()
final_evaluation = trainer.evaluate(eval_dataset=val_dataset)
print(final_evaluation)

print("\nλͺ¨λ“  과정이 μ„±κ³΅μ μœΌλ‘œ λλ‚¬μŠ΅λ‹ˆλ‹€! results ν΄λ”μ—μ„œ ν›ˆλ ¨λœ λͺ¨λΈμ„ ν™•μΈν•˜μ„Έμš”.")