"""বাংলা POS Tagger — Streamlit front end for a flair sequence tagger.

Three workflows, selected from the sidebar:
  * text input  — a single Bengali sentence is tagged and shown inline,
  * PUBLIC file upload — a .txt file is tagged line by line and returned
    as an Excel (.xlsx) download,
  * SCTR file upload — an internal spreadsheet is tagged and returned as
    a .tsv download, with occurrences of a search word extracted per row.
"""

import streamlit as st
from annotated_text import annotated_text
import warnings
import pandas as pd
from pandas import DataFrame

# Silence warnings before importing flair, which is noisy at import time.
warnings.filterwarnings('ignore')

import re, flair, random, time
from bnlp import BasicTokenizer
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

st.set_page_config(
    page_title="বাংলা POS Tagger",
    page_icon="✔️",
    layout="wide",
)


@st.cache_resource()
def load_model(model_name):
    """Load the flair SequenceTagger at *model_name*, cached across reruns."""
    return SequenceTagger.load(model_name)


def _tag_line(tagger, line):
    """Tokenize *line* with BNLP, tag it with *tagger*, return [(word, tag), ...]."""
    tokens = BasicTokenizer().tokenize(line)
    sentence = Sentence(tokens)
    tagger.predict(sentence)
    return [(token.text, token.tag) for token in sentence]


st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚")

activity = ['আপনার পছন্দ নির্বাচন করুন', 'ফাইল আপলোড (for SCTR use only)', 'ফাইল আপলোড (for PUBLIC use)', 'টেক্সট ইনপুট']
choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?', activity)

st.sidebar.markdown('''

BIS POS Tagset

''', unsafe_allow_html=True)

# Legend of the BIS POS tagset shown in the sidebar (runtime text, kept verbatim).
st.sidebar.info(''' --> _Unknown_ CC_CCD --> _Co-ordinator_ CC_CCS --> _Subordinator_ CC_CCS_UT --> _Quotative_ DM_DMD --> _Deictic demonstrative_ DM_DMR --> _Relative demonstrative_ DM_DMQ --> _Wh-word_ JJ --> _Adjective_ N_NN --> _Common noun_ N_NNP --> _Proper noun_ N_NNV --> _Verbal noun_ N_NST --> _Locative noun_ PR_PRC --> _Reciprocal pronoun_ PR_PRF --> _Reflexive pronoun_ PR_PRL --> _Relative pronoun_ PR_PRP --> _Personal pronoun_ PR_PRQ --> _Wh-word_ PSP --> _Postposition_ QT_QTC --> _Cardinals_ QT_QTF --> _General quantifier_ RB --> _Adverb_ RD_ECH --> _Echo words_ RD_PUNC --> _Punctuation_ RD_RDF --> _Foreign words_ RD_SYM --> _Symbol_ RD_UNK --> _Unknown_ RP_CL --> _Classifier particle_ RP_INJ --> _Interjection particle_ RP_INTF --> _Intensifier particle_ RP_NEG --> _Negation particle_ RP_RPD --> _Default particle_ V_VAUX --> _Auxiliary verb_ V_VM --> _Main verb_ V_VM_VF --> _Finite verb_ V_VM_VINF --> _Infinite verb_ V_VM_VNF --> _Non-finite verb_ V_VM_VNG --> _Gerund verb_ QT_QTO --> _Ordinals_ ''')
st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️")

model = load_model('best-model-002.pt')

if choice == 'টেক্সট ইনপুট':
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        # annotated_text renders each (word, tag) pair as a highlighted chip.
        annotated_text(_tag_line(model, input_data))

if choice == 'ফাইল আপলোড (for PUBLIC use)':
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        lines = uploaded_file.read().decode('utf-8').splitlines()
        # Output Excel file named after the upload, e.g. "input.txt" -> "input_tagged.xlsx".
        output_file_name = uploaded_file.name.split('.')[0] + '_tagged.xlsx'
        raw_sentences = []
        tagged_sentences = []
        with st.spinner("Wait for processing the file..."):
            for line in lines:
                pairs = _tag_line(model, line)
                raw_sentences.append(' '.join(word for word, _tag in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))
        # One row per input line: the raw tokens and the word/TAG form side by side.
        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences,
        })
        df.to_excel(output_file_name, index=False)
        with open(output_file_name, "rb") as f:
            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=f,
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

if choice == 'ফাইল আপলোড (for SCTR use only)':
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")
    if uploaded_files is not None:
        # The search word is the last space-separated token of the file's base name.
        search_word_def = uploaded_files.name.split('.')[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'
        dataframe = pd.read_excel(uploaded_files)
        # 'w' (not 'a') so a Streamlit rerun does not append duplicate rows;
        # explicit utf-8 so Bengali text survives regardless of platform default.
        with open(f_name, 'w', encoding='utf-8') as out_file:
            for index, row in dataframe.iterrows():
                # Column 'Unnamed: 4' holds the sentence text; skip empty cells.
                if pd.notnull(row['Unnamed: 4']):
                    tokens = BasicTokenizer().tokenize(row['Unnamed: 4'])
                    sentence = Sentence(tokens)
                    model.predict(sentence)
                    tagged_words = []      # every token as word/TAG
                    search_matches = []    # only tokens equal to the search word
                    for token in sentence:
                        pair = f"{token.text}/{token.tag}"
                        if token.text == search_word_def:
                            search_matches.append(pair)
                        tagged_words.append(pair)
                    out_file.write('\t'.join([
                        str(row['Unnamed: 0']),
                        str(row['Unnamed: 1']),
                        str(row['Unnamed: 2']),
                        str(row['Unnamed: 3']),
                        ' '.join(tokens),
                        ' '.join(tagged_words),
                        ' '.join(search_matches),
                    ]) + '\n')
        with open(f_name, "rb") as file:
            st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)