| | """ |
| | literal2idiomatic ver: d-1-2 |
| | """ |
| | import os |
| | from idiomify.paths import ROOT_DIR |
| | from idiomify.fetchers import fetch_pie, fetch_config |
| | from idiomify.preprocess import upsample, cleanse, stratified_split, annotate |
| | import wandb |
| |
|
| |
|
def main():
    """Build the literal->idiomatic parallel dataset and upload it to W&B.

    Pipeline: fetch PIE -> cleanse -> upsample -> annotate idiom spans
    -> stratified train/test split -> log both splits as a versioned
    `literal2idiomatic` dataset artifact on Weights & Biases.

    Side effects: writes (and always cleans up) ROOT_DIR/train.tsv and
    ROOT_DIR/test.tsv; creates a W&B run under eubinecto/idiomify.
    """
    pie_df = fetch_pie()
    # Config section for this pipeline (seed, tokens, split ratio, version, ...).
    config = fetch_config()['literal2idiomatic']
    # cleanse -> upsample -> annotate -> split; seeded twice for reproducibility.
    train_df, test_df = pie_df.pipe(cleanse)\
                              .pipe(upsample, seed=config['seed'])\
                              .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
                              .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
    # Keep only the columns downstream consumers need, in a fixed order.
    columns = ["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]
    train_df = train_df[columns]
    test_df = test_df[columns]
    dfs = (train_df, test_df)
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        train_path = ROOT_DIR / "train.tsv"
        test_path = ROOT_DIR / "test.tsv"
        paths = (train_path, test_path)
        artifact = wandb.Artifact(name="literal2idiomatic", type="dataset", description=config['description'],
                                  metadata=config)
        try:
            for tsv_path, df in zip(paths, dfs):
                # NOTE(review): the pandas integer index is written as an extra
                # unnamed column here — confirm consumers expect it (else pass
                # index=False). Preserved as-is to avoid a schema change.
                df.to_csv(tsv_path, sep="\t")
                artifact.add_file(tsv_path)
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # Always remove the local temp files, even if writing or logging
            # raised part-way through (the original leaked them on failure).
            for tsv_path in paths:
                if os.path.exists(tsv_path):
                    os.remove(tsv_path)
| |
|
| |
|
# Script entry point: run the dataset-build-and-upload pipeline.
if __name__ == '__main__':
    main()
| |
|