# multi/keyword_module.py
import torch
import requests
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, AutoTokenizer, AutoModel
from konlpy.tag import Komoran
from keybert import KeyBERT
from keybert.backend import BaseEmbedder  # needed so KoBERTEmbedding can act as a KeyBERT backend
from bs4 import BeautifulSoup as bs
# --- KoBART for summarization ---
summary_tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-summarization")
summary_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-summarization")

def summarize_kobart(text, max_input_length=512):
    # Truncate the input to the model's maximum length
    input_ids = summary_tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=max_input_length)
    with torch.no_grad():  # inference only; no gradients needed
        summary_ids = summary_model.generate(
            input_ids,
            max_length=160,
            min_length=100,
            num_beams=4,
            repetition_penalty=2.5,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
    return summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
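
# Usage sketch (added; not part of the original module). The sample string is
# an illustrative placeholder, and output quality depends on the checkpoint;
# with min_length=100 a very short input will produce padded, repetitive text.
def _demo_summary():
    sample_article = "여기에 요약할 긴 한국어 기사 본문을 넣습니다."  # hypothetical placeholder text
    print(summarize_kobart(sample_article))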
# --- KoBERT embedding class ---
# Subclasses keybert.backend.BaseEmbedder so KeyBERT accepts it as a custom
# backend; KeyBERT calls .embed() on the documents it vectorizes.
class KoBERTEmbedding(BaseEmbedder):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer

    def embed(self, documents, verbose=False):
        if isinstance(documents, str):
            documents = [documents]
        encoded_input = self.tokenizer(
            documents,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        # KoBERT expects token_type_ids; supply zeros if the tokenizer omits them
        if "token_type_ids" not in encoded_input:
            encoded_input["token_type_ids"] = torch.zeros_like(encoded_input["input_ids"])
        with torch.no_grad():
            output = self.model(**encoded_input)
        # Use the [CLS] token's hidden state as the document embedding
        return output.last_hidden_state[:, 0, :].numpy()
# --- Keyword extraction ---
keyword_tokenizer = AutoTokenizer.from_pretrained("skt/kobert-base-v1", use_fast=False)
keyword_model = AutoModel.from_pretrained("skt/kobert-base-v1")
kobert_embedder = KoBERTEmbedding(keyword_model, keyword_tokenizer)
kw_model = KeyBERT(model=kobert_embedder)
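
# Quick sanity check for the custom backend (added sketch; the sentences are
# illustrative, and the 768-dim size assumes the base-size KoBERT checkpoint):
def _demo_embedding():
    vecs = kobert_embedder.embed(["짧은 문장 하나.", "또 다른 문장."])
    print(vecs.shape)  # e.g. (2, 768)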
# --- Stopword loading + morphological analyzer ---
komoran = Komoran()

def fetch_korean_stopwords():
    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ko/master/stopwords-ko.txt"
    response = requests.get(url, timeout=5)
    response.raise_for_status()  # fail fast if the stopword list is unreachable
    return response.text.splitlines()

stopwords = fetch_korean_stopwords()

def remove_stopwords(text, stopwords):
    # Keep only nouns longer than one character that are not stopwords
    nouns = komoran.nouns(text)
    return " ".join([w for w in nouns if w not in stopwords and len(w) > 1])

def extract_keywords(summary_text, top_n=5):
    # First pass: extract candidate phrases from the noun-filtered summary
    filtered = remove_stopwords(summary_text, stopwords)
    keywords_1st = kw_model.extract_keywords(
        filtered,
        keyphrase_ngram_range=(1, 4),
        stop_words=stopwords,
        top_n=15
    )
    # Second pass: re-rank over the joined first-pass keyphrases
    joined = " ".join([kw for kw, _ in keywords_1st])
    keywords_2nd = kw_model.extract_keywords(joined, top_n=top_n)
    return keywords_1st, keywords_2nd
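
# Two-pass keyword sketch (added; the summary text is illustrative only):
def _demo_keywords():
    summary = "정부가 부동산 시장 안정을 위한 새로운 대출 규제 방안을 발표했다."
    first_pass, second_pass = extract_keywords(summary, top_n=5)
    print(first_pass)   # list of (phrase, score) tuples from the filtered text
    print(second_pass)  # re-ranked top-5 keywords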
# --- News crawling ---
def fetch_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=5)
    response.raise_for_status()
    return bs(response.text, "html.parser")

def parse_naver(soup):
    title = soup.select_one("h2.media_end_head_headline") or soup.title
    time_tag = soup.select_one("span.media_end_head_info_datestamp_time")
    content_area = soup.find("div", {"id": "newsct_article"}) or soup.find("div", {"id": "dic_area"})
    # Korean fallback strings: "제목 없음" = no title, "시간 없음" = no timestamp, "본문 없음" = no body
    title_text = title.get_text(strip=True) if title else "제목 없음"
    time_text = time_tag.get_text(strip=True) if time_tag else "시간 없음"
    if content_area:
        paragraphs = content_area.find_all("p")
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs]) if paragraphs else content_area.get_text(strip=True)
    else:
        content = "본문 없음"
    return title_text, time_text, content

def parse_daum(soup):
    title = soup.select_one("h3.tit_view") or soup.title
    time_tag = soup.select_one("span.num_date")
    content_area = soup.find("div", {"class": "article_view"})
    title_text = title.get_text(strip=True) if title else "제목 없음"
    time_text = time_tag.get_text(strip=True) if time_tag else "시간 없음"
    if content_area:
        paragraphs = content_area.find_all("p")
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs]) if paragraphs else content_area.get_text(strip=True)
    else:
        content = "본문 없음"
    return title_text, time_text, content
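
# End-to-end sketch tying the pieces together (added; not in the original
# module). The routing below assumes a "daum.net" substring identifies Daum
# articles and everything else uses the Naver layout; the URL is a placeholder.
def process_article(url):
    soup = fetch_html(url)
    if "daum.net" in url:
        title, published, body = parse_daum(soup)
    else:  # default to the Naver layout
        title, published, body = parse_naver(soup)
    summary = summarize_kobart(body)
    _, keywords = extract_keywords(summary)
    return {"title": title, "time": published, "summary": summary, "keywords": keywords}

if __name__ == "__main__":
    result = process_article("https://n.news.naver.com/article/...")  # placeholder URL
    print(result)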