| import streamlit as st |
| import torch |
| from transformers import AutoTokenizer, AutoModel, pipeline |
| from torch import nn |
|
|
# Page heading rendered at the top of the Streamlit app.
st.markdown("### Articles classificator.")
|
|
@st.cache(allow_output_mutation=True)
def get_tokenizer():
    """Load the DeBERTa-v3-small tokenizer, cached by Streamlit across reruns."""
    return AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
|
|
# Built once at import time; st.cache makes subsequent script reruns cheap.
tokenizer = get_tokenizer()
|
|
class devops_model(nn.Module):
    """Mean-pooled transformer embeddings followed by a 5-way log-softmax head.

    The class name is load-bearing: `torch.load('model_full.pt')` unpickles a
    full model object and resolves this class by name.
    """

    def __init__(self):
        super().__init__()
        # Backbone transformer; left empty here and populated when the
        # pickled checkpoint is loaded.
        self.berta = None
        hidden = 768  # DeBERTa-v3-small hidden size
        self.fc = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(hidden),
            nn.Linear(hidden, 5),
            nn.LogSoftmax(dim=-1),
        )

    def forward(self, train_batch):
        """Mean-pool the backbone's last hidden state over tokens, then classify.

        Returns log-probabilities of shape (batch, 5).
        """
        pooled = self.berta(**train_batch)['last_hidden_state'].mean(dim=1)
        return self.fc(pooled)
|
|
@st.cache
def LoadModel():
    """Load the full pickled classifier onto CPU; cached by Streamlit.

    NOTE(review): torch.load of a full model unpickles arbitrary code —
    only ever load trusted checkpoints. Requires the `devops_model` class
    to be importable under its original name.
    """
    return torch.load('model_full.pt', map_location=torch.device('cpu'))
|
|
# Trained classifier, loaded once per session for CPU inference.
model = LoadModel()


# Human-readable labels for the model's 5 outputs; `process` indexes this
# list by predicted class index.
# NOTE(review): assumed to match the label order used at training time — confirm.
classes = ['Computer Science', 'Mathematics', 'Physics', 'Quantitative Biology', 'Statistics']
|
|
def process(title, summary):
    """Classify an article and return the top classes covering >=95% probability.

    Parameters
    ----------
    title, summary : str
        Free-text article fields; joined into a single input for the model.

    Returns
    -------
    list[str]
        Lines of the form "<class>: <prob>", highest probability first;
        empty when both inputs are blank.
    """
    # Join with a space so the last word of the title and the first word of
    # the summary are not fused into a single token.
    text = f'{title} {summary}'
    if not text.strip():
        # Return an empty list (not ''): keeps the return type consistent
        # with the normal path; callers iterate over the result either way.
        return []
    model.eval()
    batch = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        log_probs = model(batch)
    probs = torch.exp(log_probs[0])  # single-item batch -> vector of class probs
    order = torch.argsort(probs, descending=True)
    res = []
    cumulative = 0.0
    # Take classes in descending probability until 95% of the mass is covered.
    # Bounded for-loop: cannot index past the number of classes.
    for cls_idx in order:
        p = probs[cls_idx].item()
        res.append(f'{classes[cls_idx]}: {p:.3f}')
        cumulative += p
        if cumulative >= 0.95:
            break
    return res
| |
# Input widgets: the two free-text fields fed to the classifier.
title = st.text_area("Title", height=30)


summary = st.text_area("Summary", height=180)


# Re-runs on every widget change; renders one "<class>: <prob>" line per
# predicted class until 95% cumulative probability is covered.
for string in process(title, summary):
    st.markdown(string)