Update util/keywordExtract.py

util/keywordExtract.py  CHANGED  (+34 -22)
@@ -21,7 +21,6 @@ def load_company_list(file_path='상장법인목록.xls'):
 summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
 summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")

-
 def summarize_kobart(text):
     input_ids = summary_tokenizer.encode(text, return_tensors="pt")
     summary_ids = summary_model.generate(
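The generate() arguments of summarize_kobart are cut off in this view. A minimal sketch of how a KoBART summarization helper of this shape typically looks; the parameter values below are assumptions for illustration, not the repo's actual settings:

from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")

def summarize_kobart_sketch(text):
    # Encode the article, generate a summary, and decode it back to text.
    input_ids = summary_tokenizer.encode(text, return_tensors="pt")
    summary_ids = summary_model.generate(
        input_ids,
        max_length=128,      # assumed cap on summary length
        num_beams=4,         # assumed beam width
        early_stopping=True,
    )
    return summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)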
@@ -58,34 +57,46 @@ kw_model = KeyBERT(model=kobert_embedder)

 STOPWORDS_FILE = "stopwords-ko.txt"

-# ✅ sentiment analysis model (e.g.
-sentiment_model_name = "
+# ✅ sentiment analysis model (e.g. snunlp/KR-FinBert-SC)
+sentiment_model_name = "snunlp/KR-FinBert-SC"
 bert_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
 bert_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-bert_model = bert_model.to(device)

-
-
-    with torch.no_grad():
-        prediction = bert_model(**tokens)
-    prediction = F.softmax(prediction.logits, dim=1)
-    output = prediction.argmax(dim=1).item()
-    labels = ["부정적", "중립적", "긍정적"]
-    return labels[output]
+sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)  # 👈 define the tokenizer
+sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)

-
-sentiment_model = BertForSequenceClassification.from_pretrained("kykim/bert-kor-base")
+sentiment_model = sentiment_model.to(device)

 def analyze_sentiment(text):
-    inputs = sentiment_tokenizer(
+    inputs = sentiment_tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=512  # 👈 added
+    ).to(device)
+
+    # model inference
     with torch.no_grad():
-        outputs =
-
-
-
-
-
+        outputs = bert_model(**inputs)
+    logits = outputs.logits
+    # compute probabilities
+    print("logits:", logits)
+    print("logits.shape:", logits.shape)
+
+    probs = F.softmax(logits, dim=1)[0]
+    # labeling
+    label_idx = torch.argmax(probs).item()
+    labels = ["부정적", "중립적", "긍정적"]
+    label = labels[label_idx]
+
+    return {
+        "negative": round(float(probs[0]), 4),
+        "neutral": round(float(probs[1]), 4),
+        "positive": round(float(probs[2]), 4),
+    }
+

 def get_or_download_stopwords():
     # 1. if the file exists, read and return it
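With this hunk, analyze_sentiment returns the three class probabilities as a dict instead of a single label string. A minimal usage sketch, assuming util.keywordExtract is imported, everything runs on one device, and the KR-FinBert-SC classes are ordered negative/neutral/positive as the labels list implies (the numbers shown are illustrative):

scores = analyze_sentiment("삼성전자가 기대 이상의 분기 실적을 발표했다.")  # example headline
print(scores)            # e.g. {"negative": 0.02, "neutral": 0.11, "positive": 0.87}

# The dominant class can still be recovered from the returned probabilities.
top_label = max(scores, key=scores.get)
print(top_label)         # "positive" in this illustrative case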
@@ -125,8 +136,9 @@ def resultKeyword(content) :

     # extract keywords from the summarized text after stopword removal
     filtered_summary = remove_stopwords(summary, korean_stopwords)
+    filtered_content = remove_stopwords(content, korean_stopwords)
     keywords = kw_model.extract_keywords(
-
+        filtered_content,
         keyphrase_ngram_range=(1, 2),  # compound nouns can be kept
         stop_words=None,
         top_n=5
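The last hunk passes the stopword-filtered article body (filtered_content) to KeyBERT as the document. For reference, a standalone sketch of the call shape; the sentence-embedding model named below is an assumption, since the repo wraps its own kobert_embedder:

from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Any Korean sentence-embedding model can stand in for the repo's kobert_embedder here.
kw_model = KeyBERT(model=SentenceTransformer("jhgan/ko-sroberta-multitask"))

filtered_content = "삼성전자 분기 실적 발표 영업이익 증가 반도체 수요 회복"  # stopword-filtered text
keywords = kw_model.extract_keywords(
    filtered_content,
    keyphrase_ngram_range=(1, 2),  # allow one- and two-word phrases
    stop_words=None,
    top_n=5,
)
# extract_keywords returns (keyword, score) tuples, e.g.
# [("반도체 수요", 0.61), ("영업이익 증가", 0.55), ...]  -- scores are illustrative
print(keywords)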