"""বাংলা POS Tagger — Streamlit front end for a flair sequence tagger.

Three workflows, selected from the sidebar:
  * text input  — a single Bengali sentence is tagged and shown inline,
  * PUBLIC file upload — a .txt file is tagged line by line and returned
    as an Excel (.xlsx) download,
  * SCTR file upload — an internal spreadsheet is tagged and returned as
    a .tsv download, with occurrences of a search word extracted per row.
"""

import streamlit as st
from annotated_text import annotated_text
import warnings
import pandas as pd
from pandas import DataFrame

# Silence warnings before importing flair, which is noisy at import time.
warnings.filterwarnings('ignore')

import re, flair, random, time
from bnlp import BasicTokenizer
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

st.set_page_config(
    page_title="বাংলা POS Tagger",
    page_icon="✔️",
    layout="wide",
)


@st.cache_resource()
def load_model(model_name):
    """Load the flair SequenceTagger at *model_name*, cached across reruns."""
    return SequenceTagger.load(model_name)


def _tag_line(tagger, line):
    """Tokenize *line* with BNLP, tag it with *tagger*, return [(word, tag), ...]."""
    tokens = BasicTokenizer().tokenize(line)
    sentence = Sentence(tokens)
    tagger.predict(sentence)
    return [(token.text, token.tag) for token in sentence]


st.info('যাদবপুর বিশ্ববিদ্যালয়ের কম্পিউটার সায়েন্স অ্যান্ড ইঞ্জিনিয়ারিং বিভাগের একটি উদ্যোগ', icon="📚")

activity = ['আপনার পছন্দ নির্বাচন করুন', 'ফাইল আপলোড (for SCTR use only)', 'ফাইল আপলোড (for PUBLIC use)', 'টেক্সট ইনপুট']
choice = st.sidebar.selectbox('আপনি কিভাবে এটি প্রক্রিয়া করতে চান?', activity)

st.sidebar.markdown('''

BIS POS Tagset

''', unsafe_allow_html=True)

# Legend of the BIS POS tagset shown in the sidebar (runtime text, kept verbatim).
st.sidebar.info(''' --> _Unknown_ CC_CCD --> _Co-ordinator_ CC_CCS --> _Subordinator_ CC_CCS_UT --> _Quotative_ DM_DMD --> _Deictic demonstrative_ DM_DMR --> _Relative demonstrative_ DM_DMQ --> _Wh-word_ JJ --> _Adjective_ N_NN --> _Common noun_ N_NNP --> _Proper noun_ N_NNV --> _Verbal noun_ N_NST --> _Locative noun_ PR_PRC --> _Reciprocal pronoun_ PR_PRF --> _Reflexive pronoun_ PR_PRL --> _Relative pronoun_ PR_PRP --> _Personal pronoun_ PR_PRQ --> _Wh-word_ PSP --> _Postposition_ QT_QTC --> _Cardinals_ QT_QTF --> _General quantifier_ RB --> _Adverb_ RD_ECH --> _Echo words_ RD_PUNC --> _Punctuation_ RD_RDF --> _Foreign words_ RD_SYM --> _Symbol_ RD_UNK --> _Unknown_ RP_CL --> _Classifier particle_ RP_INJ --> _Interjection particle_ RP_INTF --> _Intensifier particle_ RP_NEG --> _Negation particle_ RP_RPD --> _Default particle_ V_VAUX --> _Auxiliary verb_ V_VM --> _Main verb_ V_VM_VF --> _Finite verb_ V_VM_VINF --> _Infinite verb_ V_VM_VNF --> _Non-finite verb_ V_VM_VNG --> _Gerund verb_ QT_QTO --> _Ordinals_ ''')
st.sidebar.info('সর্বশেষ সংশোধিত তারিখ: ০৪ এপ্রিল ২০২৫', icon="ℹ️")

model = load_model('best-model-002.pt')

if choice == 'টেক্সট ইনপুট':
    input_data = st.text_area("আপনার বাংলা বাক্য লিখুন", value="", height=10)
    if st.button('প্রক্রিয়া শুরু করতে ক্লিক করুন'):
        # annotated_text renders each (word, tag) pair as a highlighted chip.
        annotated_text(_tag_line(model, input_data))

if choice == 'ফাইল আপলোড (for PUBLIC use)':
    uploaded_file = st.file_uploader("আপনার ফাইল নির্বাচন করুন", type='.txt')
    if uploaded_file is not None:
        lines = uploaded_file.read().decode('utf-8').splitlines()
        # Output Excel file named after the upload, e.g. "input.txt" -> "input_tagged.xlsx".
        output_file_name = uploaded_file.name.split('.')[0] + '_tagged.xlsx'
        raw_sentences = []
        tagged_sentences = []
        with st.spinner("Wait for processing the file..."):
            for line in lines:
                pairs = _tag_line(model, line)
                raw_sentences.append(' '.join(word for word, _tag in pairs))
                tagged_sentences.append(' '.join(f"{word}/{tag}" for word, tag in pairs))
        # One row per input line: the raw tokens and the word/TAG form side by side.
        df = pd.DataFrame({
            "Raw Sentence": raw_sentences,
            "Tagged Sentence": tagged_sentences,
        })
        df.to_excel(output_file_name, index=False)
        with open(output_file_name, "rb") as f:
            st.download_button(
                label="Download the tagged data in Excel (.xlsx) format",
                data=f,
                file_name=output_file_name,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

if choice == 'ফাইল আপলোড (for SCTR use only)':
    uploaded_files = st.file_uploader("আপনার ফাইল নির্বাচন করুন")
    if uploaded_files is not None:
        # The search word is the last space-separated token of the file's base name.
        search_word_def = uploaded_files.name.split('.')[0].split(' ')[-1]
        f_name = search_word_def + '.tsv'
        dataframe = pd.read_excel(uploaded_files)
        # 'w' (not 'a') so a Streamlit rerun does not append duplicate rows;
        # explicit utf-8 so Bengali text survives regardless of platform default.
        with open(f_name, 'w', encoding='utf-8') as out_file:
            for index, row in dataframe.iterrows():
                # Column 'Unnamed: 4' holds the sentence text; skip empty cells.
                if pd.notnull(row['Unnamed: 4']):
                    tokens = BasicTokenizer().tokenize(row['Unnamed: 4'])
                    sentence = Sentence(tokens)
                    model.predict(sentence)
                    tagged_words = []      # every token as word/TAG
                    search_matches = []    # only tokens equal to the search word
                    for token in sentence:
                        pair = f"{token.text}/{token.tag}"
                        if token.text == search_word_def:
                            search_matches.append(pair)
                        tagged_words.append(pair)
                    out_file.write('\t'.join([
                        str(row['Unnamed: 0']),
                        str(row['Unnamed: 1']),
                        str(row['Unnamed: 2']),
                        str(row['Unnamed: 3']),
                        ' '.join(tokens),
                        ' '.join(tagged_words),
                        ' '.join(search_matches),
                    ]) + '\n')
        with open(f_name, "rb") as file:
            st.download_button(label="TSV ফাইল হিসাবে ডেটা ডাউনলোড করুন", data=file, file_name=f_name)