OliverPerrin committed
Commit 6bae907 · 1 Parent(s): a45b0d7
Expand datasets: GoEmotions (28 emotions), Yahoo Answers (10 topics), BookSum
- Updated download_data.py with new dataset downloaders
- Updated preprocess_data.py to handle JSONL format
- Updated labels.json with 28 emotions and 10 topics
- Updated datasets.yaml with new dataset configs
- Updated full.yaml training config (warmup_steps: 1000)
- Training results: 74% topic accuracy, ROUGE 0.284
- artifacts/labels.json +32 -4
- configs/data/datasets.yaml +49 -7
- configs/training/full.yaml +1 -1
- outputs/training_history.json +39 -39
- scripts/download_data.py +319 -204
- scripts/preprocess_data.py +30 -12
artifacts/labels.json
CHANGED
@@ -1,16 +1,44 @@
 {
   "emotion": [
+    "admiration",
+    "amusement",
     "anger",
+    "annoyance",
+    "approval",
+    "caring",
+    "confusion",
+    "curiosity",
+    "desire",
+    "disappointment",
+    "disapproval",
+    "disgust",
+    "embarrassment",
+    "excitement",
     "fear",
+    "gratitude",
+    "grief",
     "joy",
     "love",
+    "nervousness",
+    "neutral",
+    "optimism",
+    "pride",
+    "realization",
+    "relief",
+    "remorse",
     "sadness",
     "surprise"
   ],
   "topic": [
-    "Business",
+    "Business & Finance",
+    "Computers & Internet",
+    "Education & Reference",
+    "Entertainment & Music",
+    "Family & Relationships",
+    "Health",
+    "Politics & Government",
+    "Science & Mathematics",
+    "Society & Culture",
+    "Sports"
   ]
 }
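As a quick check of the new label inventory, the file can be loaded directly; a minimal sketch, assuming it is run from the repository root after this commit:

import json

with open("artifacts/labels.json", encoding="utf-8") as f:
    labels = json.load(f)

# The expanded taxonomy from this commit: 28 emotions, 10 topics.
print(len(labels["emotion"]), len(labels["topic"]))  # 28 10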
configs/data/datasets.yaml
CHANGED
@@ -1,21 +1,55 @@
+# Dataset configuration for LexiMind
+# Expanded dataset support for comprehensive emotion and topic classification
+
 raw:
   summarization: data/raw/summarization
   emotion: data/raw/emotion
   topic: data/raw/topic
   books: data/raw/books
+
 processed:
   summarization: data/processed/summarization
   emotion: data/processed/emotion
   topic: data/processed/topic
   books: data/processed/books
+
 tokenizer:
   pretrained_model_name: google/flan-t5-base
   max_length: 512
   lower: false
+
+# Dataset download configuration
 downloads:
+  # Summarization: CNN/DailyMail (287K) + BookSum (9.6K)
   summarization:
+    - name: cnn_dailymail
+      dataset: cnn_dailymail
+      config: "3.0.0"
+      source_field: article
+      target_field: highlights
+      max_samples: 100000  # Subset for training time
+    - name: booksum
+      dataset: kmfoda/booksum
+      source_field: chapter
+      target_field: summary
+      max_samples: 9600  # Full dataset
+
+  # Emotions: GoEmotions (28 emotions, 43K samples)
+  emotion:
+    dataset: google-research-datasets/go_emotions
+    config: simplified
+    text_field: text
+    label_field: labels
+    multi_label: true
+
+  # Topics: Yahoo Answers (10 topics, 1.4M samples)
+  topic:
+    dataset: yahoo_answers_topics
+    text_field: best_answer  # Use the answer text
+    label_field: topic
+    max_samples: 200000  # Subset for reasonable training time
+
+  # Project Gutenberg books for inference demos
   books:
     - name: pride_and_prejudice
      url: https://www.gutenberg.org/cache/epub/1342/pg1342.txt
@@ -29,7 +63,15 @@ downloads:
     - name: moby_dick
       url: https://www.gutenberg.org/cache/epub/2701/pg2701.txt
       output: data/raw/books/moby_dick.txt
+    - name: dracula
+      url: https://www.gutenberg.org/cache/epub/345/pg345.txt
+      output: data/raw/books/dracula.txt
+    - name: alice_in_wonderland
+      url: https://www.gutenberg.org/cache/epub/11/pg11.txt
+      output: data/raw/books/alice_in_wonderland.txt
+    - name: great_gatsby
+      url: https://www.gutenberg.org/cache/epub/64317/pg64317.txt
+      output: data/raw/books/great_gatsby.txt
+    - name: war_and_peace
+      url: https://www.gutenberg.org/cache/epub/2600/pg2600.txt
+      output: data/raw/books/war_and_peace.txt
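The downloads section drives scripts/download_data.py below. As an illustration only (the project reads this file through its own src.utils.config.load_yaml helper, not shown here), the same structure can be inspected with plain PyYAML:

import yaml

with open("configs/data/datasets.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Each summarization entry carries its own dataset name and sample cap.
for entry in cfg["downloads"]["summarization"]:
    print(entry["name"], entry.get("max_samples"))
# cnn_dailymail 100000
# booksum 9600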
configs/training/full.yaml
CHANGED
@@ -19,7 +19,7 @@ optimizer:

 scheduler:
   name: cosine
-  warmup_steps: …
+  warmup_steps: 1000

 trainer:
   max_epochs: 3
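How the trainer maps scheduler: cosine with warmup_steps: 1000 onto an actual schedule is not part of this diff; the sketch below is only an assumed wiring through the Hugging Face transformers helper, with a stand-in model and a placeholder step count:

import torch
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(8, 8)  # stand-in; not the LexiMind model
optimizer = AdamW(model.parameters(), lr=1e-4)  # lr illustrative, not from this config
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=1000,      # warmup_steps from configs/training/full.yaml
    num_training_steps=10_000,  # placeholder; depends on dataset size and max_epochs
)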
outputs/training_history.json
CHANGED
@@ -1,59 +1,59 @@
 {
   "train_epoch_1": {
+    "summarization_loss": 5.035701327740604,
+    "summarization_rouge_like": 0.16390742100245742,
+    "emotion_loss": 0.21049204537547025,
+    "emotion_f1": 0.002655381929628719,
+    "topic_loss": 1.176912516972419,
+    "topic_accuracy": 0.6581478229164939,
+    "total_loss": 6.423106049642868,
     "epoch": 1.0
   },
   "val_epoch_1": {
+    "summarization_loss": 4.6882993674363105,
+    "summarization_rouge_like": 0.19405199466966144,
+    "emotion_loss": 0.15183634538985658,
+    "emotion_f1": 0.0016098967067287486,
+    "topic_loss": 0.8788343331143526,
+    "topic_accuracy": 0.7251652262328394,
     "epoch": 1.0
   },
   "train_epoch_2": {
+    "summarization_loss": 4.561023824777751,
+    "summarization_rouge_like": 0.20945581532076613,
+    "emotion_loss": 0.14958151845580364,
+    "emotion_f1": 0.008022325540815077,
+    "topic_loss": 0.8585619787599033,
+    "topic_accuracy": 0.7299605100316837,
+    "total_loss": 5.569167470253677,
     "epoch": 2.0
   },
   "val_epoch_2": {
+    "summarization_loss": 4.335443331423179,
+    "summarization_rouge_like": 0.2383154143354784,
+    "emotion_loss": 0.1478777239331147,
+    "emotion_f1": 0.010150822387259202,
+    "topic_loss": 0.841049696600522,
+    "topic_accuracy": 0.7359938993390932,
     "epoch": 2.0
   },
   "train_epoch_3": {
+    "summarization_loss": 4.332563984521343,
+    "summarization_rouge_like": 0.24358268281949097,
+    "emotion_loss": 0.14520242059475916,
+    "emotion_f1": 0.026584760984350638,
+    "topic_loss": 0.8084657974773926,
+    "topic_accuracy": 0.7434995609372882,
+    "total_loss": 5.286232347914138,
     "epoch": 3.0
   },
   "val_epoch_3": {
+    "summarization_loss": 4.0994785383502785,
+    "summarization_rouge_like": 0.2839536633314319,
+    "emotion_loss": 0.14214695994858215,
+    "emotion_f1": 0.028164719230763854,
+    "topic_loss": 0.8218616072552484,
+    "topic_accuracy": 0.7413319776309091,
     "epoch": 3.0
   }
 }
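The headline numbers in the commit message correspond to the final validation epoch above; a minimal sketch that reads the history back and prints the validation curve:

import json

with open("outputs/training_history.json", encoding="utf-8") as f:
    history = json.load(f)

for name, metrics in history.items():
    if name.startswith("val_"):
        print(name, round(metrics["topic_accuracy"], 3), round(metrics["summarization_rouge_like"], 3))
# val_epoch_3 prints 0.741 and 0.284, i.e. the reported 74% topic accuracy and ROUGE-like 0.284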
scripts/download_data.py
CHANGED
@@ -1,8 +1,11 @@
 """
 Dataset download script for LexiMind.

-Downloads training datasets from …
+Downloads training datasets from HuggingFace Hub and Project Gutenberg:
+- GoEmotions: 28 emotion labels (43K samples)
+- Yahoo Answers: 10 topic labels (1.4M samples, subset to 200K)
+- CNN/DailyMail + BookSum: Summarization (100K + 9.6K samples)
+- Gutenberg: Classic books for inference demos

 Author: Oliver Perrin
 Date: December 2025
@@ -12,15 +15,17 @@ from __future__ import annotations

 import argparse
 import json
+import random
 import socket
 import sys
 from pathlib import Path
-from typing import Iterable, Iterator, cast
+from typing import Any, cast
 from urllib.error import URLError
 from urllib.request import urlopen

-from datasets import ClassLabel, …
+from datasets import ClassLabel, DatasetDict, load_dataset
+from datasets import Sequence as DatasetSequence
+from tqdm import tqdm

 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
@@ -29,228 +34,338 @@ if str(PROJECT_ROOT) not in sys.path:
 from src.utils.config import load_yaml

 DOWNLOAD_TIMEOUT = 60
-DEFAULT_SUMMARIZATION_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
-DEFAULT_EMOTION_DATASET = "dair-ai/emotion"
-DEFAULT_TOPIC_DATASET = "ag_news"
-DEFAULT_BOOK_URL = "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"
-DEFAULT_BOOK_OUTPUT = "data/raw/books/pride_and_prejudice.txt"
+
+# --------------- Label Definitions ---------------
+
+EMOTION_LABELS = [
+    "admiration",
+    "amusement",
+    "anger",
+    "annoyance",
+    "approval",
+    "caring",
+    "confusion",
+    "curiosity",
+    "desire",
+    "disappointment",
+    "disapproval",
+    "disgust",
+    "embarrassment",
+    "excitement",
+    "fear",
+    "gratitude",
+    "grief",
+    "joy",
+    "love",
+    "nervousness",
+    "optimism",
+    "pride",
+    "realization",
+    "relief",
+    "remorse",
+    "sadness",
+    "surprise",
+    "neutral",
+]
+
+TOPIC_LABELS = [
+    "Society & Culture",
+    "Science & Mathematics",
+    "Health",
+    "Education & Reference",
+    "Computers & Internet",
+    "Sports",
+    "Business & Finance",
+    "Entertainment & Music",
+    "Family & Relationships",
+    "Politics & Government",
+]
+
+
+# --------------- Utility Functions ---------------
+
+
+def _write_jsonl(records: list[dict], destination: Path, desc: str = "Writing") -> None:
+    """Write records to JSONL file with progress bar."""
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with destination.open("w", encoding="utf-8") as f:
+        for record in tqdm(records, desc=desc, leave=False):
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")


 def gutenberg_download(url: str, output_path: str) -> None:
+    """Download a text file from Project Gutenberg."""
     target = Path(output_path)
     target.parent.mkdir(parents=True, exist_ok=True)
     try:
-        with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response …
-    except (URLError, socket.timeout, OSError) as error:
-        raise RuntimeError(f"Failed to download '{url}' to '{target}': {error}") from error
+        with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response:
+            content = response.read()
+        target.write_bytes(content)
+    except (URLError, socket.timeout, OSError) as e:
+        raise RuntimeError(f"Failed to download '{url}': {e}") from e


-    parser = argparse.ArgumentParser(description="Download datasets required for LexiMind training")
-    parser.add_argument(
-        "--config",
-        default="configs/data/datasets.yaml",
-        help="Path to the dataset configuration YAML.",
-    )
-    parser.add_argument(
-        "--skip-kaggle",
-        action="store_true",
-        help="Skip downloading the Kaggle summarization dataset.",
-    )
-    parser.add_argument(
-        "--skip-book", action="store_true", help="Skip downloading Gutenberg book texts."
-    )
-    return parser.parse_args()
-
-
-    config_path = Path(path)
-    if not config_path.exists():
-        raise FileNotFoundError(f"Config file not found: {config_path}")
-    return load_yaml(str(config_path)).data
-
-
-    args = parse_args()
-    config = _safe_load_config(args.config)
-    summarization_dataset = summarization_cfg.get("dataset", DEFAULT_SUMMARIZATION_DATASET)
-    summarization_output = summarization_cfg.get(
-        "output", raw_paths.get("summarization", "data/raw/summarization")
-    )
-    topic_cfg = downloads_cfg.get("topic", {}) if isinstance(downloads_cfg, dict) else {}
-    topic_name = topic_cfg.get("dataset", DEFAULT_TOPIC_DATASET)
-    topic_dir = Path(raw_paths.get("topic", "data/raw/topic"))
-    topic_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Downloading topic dataset '{topic_name}' -> {topic_dir}")
-    topic_dataset = cast(DatasetDict, load_dataset(topic_name))
-    first_topic_key = next(iter(topic_dataset.keys()), None) if topic_dataset else None
-    topic_label_feature = (
-        topic_dataset[first_topic_key].features.get("label")
-        if first_topic_key is not None
-        else None
-    )
+# --------------- Emotion Dataset (GoEmotions) ---------------
+
+
+def download_emotion_dataset(output_dir: Path, config: dict) -> None:
+    """Download GoEmotions dataset with 28 emotion labels."""
+    print("\n📥 Downloading GoEmotions (28 emotions)...")
+
+    dataset_name = config.get("dataset", "google-research-datasets/go_emotions")
+    dataset_config = config.get("config", "simplified")
+
+    ds = cast(DatasetDict, load_dataset(dataset_name, dataset_config))
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get label names from dataset
+    label_feature = ds["train"].features.get("labels")
+    inner_feature = getattr(label_feature, "feature", None)
+    if isinstance(label_feature, DatasetSequence) and isinstance(inner_feature, ClassLabel):
+        label_names = cast(list[str], inner_feature.names)
+    else:
+        label_names = EMOTION_LABELS
+
+    for split_name, split in ds.items():
+        records = []
+        for item in tqdm(split, desc=f"Processing {split_name}", leave=False):
+            row = cast(dict[str, Any], item)
+            text = row.get("text", "")
+            label_indices = row.get("labels", [])
+            # Convert indices to label names
+            emotions = [label_names[i] for i in label_indices if 0 <= i < len(label_names)]
+            if text and emotions:
+                records.append({"text": text, "emotions": emotions})
+
+        output_path = output_dir / f"{split_name}.jsonl"
+        _write_jsonl(records, output_path, f"Writing {split_name}")
+        print(f"  ✓ {split_name}: {len(records):,} samples -> {output_path}")
+
+    # Save label names
+    labels_path = output_dir / "labels.json"
+    labels_path.write_text(json.dumps(label_names, indent=2))
+    print(f"  ✓ Labels ({len(label_names)}): {labels_path}")
+
+
+# --------------- Topic Dataset (Yahoo Answers) ---------------
+
+
+def download_topic_dataset(output_dir: Path, config: dict) -> None:
+    """Download Yahoo Answers dataset with 10 topic labels."""
+    print("\n📥 Downloading Yahoo Answers (10 topics)...")
+
+    dataset_name = config.get("dataset", "yahoo_answers_topics")
+    max_samples = config.get("max_samples", 200000)
+
+    ds = cast(DatasetDict, load_dataset(dataset_name))
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get label names
+    label_feature = ds["train"].features.get("topic")
+    if isinstance(label_feature, ClassLabel):
+        label_names = label_feature.names
+    else:
+        label_names = TOPIC_LABELS
+
+    for split_name, split in ds.items():
+        # Determine sample limit for this split
+        if split_name == "train":
+            limit = max_samples
+        else:
+            limit = min(len(split), max_samples // 10)
+
+        # Random sample if needed
+        indices = list(range(len(split)))
+        if len(indices) > limit:
+            random.seed(42)
+            indices = random.sample(indices, limit)
+
+        records = []
+        for idx in tqdm(indices, desc=f"Processing {split_name}", leave=False):
+            item = cast(dict[str, Any], split[idx])
+            # Combine question and best answer for richer text
+            question = item.get("question_title", "") + " " + item.get("question_content", "")
+            answer = item.get("best_answer", "")
+            text = (question + " " + answer).strip()
+
+            topic_idx = item.get("topic", 0)
+            topic = label_names[topic_idx] if 0 <= topic_idx < len(label_names) else str(topic_idx)
+
+            if text and len(text) > 50:  # Filter very short texts
+                records.append({"text": text, "topic": topic})
+
+        output_path = output_dir / f"{split_name}.jsonl"
+        _write_jsonl(records, output_path, f"Writing {split_name}")
+        print(f"  ✓ {split_name}: {len(records):,} samples -> {output_path}")
+
+    # Save label names
+    labels_path = output_dir / "labels.json"
+    labels_path.write_text(json.dumps(label_names, indent=2))
+    print(f"  ✓ Labels ({len(label_names)}): {labels_path}")
+
+
+# --------------- Summarization Dataset (CNN/DailyMail + BookSum) ---------------
+
+
+def download_summarization_datasets(output_dir: Path, config: list[dict]) -> None:
+    """Download summarization datasets (CNN/DailyMail and BookSum)."""
+    print("\n📥 Downloading Summarization datasets...")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    all_train, all_val, all_test = [], [], []
+
+    for ds_config in config:
+        name = ds_config.get("name", "unknown")
+        dataset_name = ds_config.get("dataset")
+        dataset_config = ds_config.get("config")
+        source_field = ds_config.get("source_field", "article")
+        target_field = ds_config.get("target_field", "highlights")
+        max_samples = ds_config.get("max_samples")
+
+        print(f"\n  Loading {name}...")
+
+        if not dataset_name:
+            print(f"  ✗ Skipping {name}: no dataset specified")
+            continue
+
+        if dataset_config:
+            ds = cast(DatasetDict, load_dataset(str(dataset_name), str(dataset_config)))
+        else:
+            ds = cast(DatasetDict, load_dataset(str(dataset_name)))
+
+        for split_name, split in ds.items():
+            split_str = str(split_name)
+            # Determine limit
+            limit = max_samples if max_samples else len(split)
+            if split_str != "train":
+                limit = min(len(split), limit // 10)
+
+            indices = list(range(min(len(split), limit)))
+
+            records = []
+            for idx in tqdm(indices, desc=f"{name}/{split_str}", leave=False):
+                item = cast(dict[str, Any], split[idx])
+                source = item.get(source_field, "")
+                target = item.get(target_field, "")
+
+                if source and target and len(str(source)) > 100:
+                    records.append({"source": source, "summary": target})
+
+            # Route to appropriate split
+            if "train" in split_str:
+                all_train.extend(records)
+            elif "val" in split_str or "validation" in split_str:
+                all_val.extend(records)
+            else:
+                all_test.extend(records)
+
+            print(f"  ✓ {split_name}: {len(records):,} samples")
+
+    # Write combined files
+    if all_train:
+        _write_jsonl(all_train, output_dir / "train.jsonl", "Writing train")
+        print(f"  ✓ Combined train: {len(all_train):,} samples")
+    if all_val:
+        _write_jsonl(all_val, output_dir / "validation.jsonl", "Writing validation")
+        print(f"  ✓ Combined validation: {len(all_val):,} samples")
+    if all_test:
+        _write_jsonl(all_test, output_dir / "test.jsonl", "Writing test")
+        print(f"  ✓ Combined test: {len(all_test):,} samples")
+
+
+# --------------- Book Downloads (Gutenberg) ---------------
+
+
+def download_books(books_dir: Path, config: list[dict]) -> None:
+    """Download classic books from Project Gutenberg."""
+    print("\n📥 Downloading Gutenberg books...")
+
+    books_dir.mkdir(parents=True, exist_ok=True)
+
+    for book in config:
+        name = book.get("name", "unknown")
+        url = book.get("url")
+        output = book.get("output", str(books_dir / f"{name}.txt"))
+
+        if not url:
+            continue
+
+        output_path = Path(output)
+        if output_path.exists():
+            print(f"  ✓ {name}: already exists")
+            continue
+
+        try:
+            print(f"  ⏳ {name}: downloading...")
+            gutenberg_download(url, str(output_path))
+            print(f"  ✓ {name}: {output_path}")
+        except Exception as e:
+            print(f"  ✗ {name}: {e}")
+
+
+# --------------- Main Entry Point ---------------
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Download LexiMind training datasets")
+    parser.add_argument(
+        "--config", default="configs/data/datasets.yaml", help="Dataset config path"
+    )
+    parser.add_argument(
+        "--skip-summarization", action="store_true", help="Skip summarization datasets"
+    )
+    parser.add_argument("--skip-emotion", action="store_true", help="Skip emotion dataset")
+    parser.add_argument("--skip-topic", action="store_true", help="Skip topic dataset")
+    parser.add_argument("--skip-books", action="store_true", help="Skip Gutenberg books")
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    # Load config
+    config_path = Path(args.config)
+    if not config_path.exists():
+        print(f"Config not found: {config_path}")
+        sys.exit(1)
+
+    config = load_yaml(str(config_path)).data
+    raw_paths = config.get("raw", {})
+    downloads = config.get("downloads", {})
+
+    print("=" * 60)
+    print("LexiMind Dataset Download")
+    print("=" * 60)
+
+    # Download emotion dataset
+    if not args.skip_emotion:
+        emotion_config = downloads.get("emotion", {})
+        emotion_dir = Path(raw_paths.get("emotion", "data/raw/emotion"))
+        download_emotion_dataset(emotion_dir, emotion_config)
+
+    # Download topic dataset
+    if not args.skip_topic:
+        topic_config = downloads.get("topic", {})
+        topic_dir = Path(raw_paths.get("topic", "data/raw/topic"))
+        download_topic_dataset(topic_dir, topic_config)
+
+    # Download summarization datasets
+    if not args.skip_summarization:
+        summ_config = downloads.get("summarization", [])
+        if isinstance(summ_config, list):
+            summ_dir = Path(raw_paths.get("summarization", "data/raw/summarization"))
+            download_summarization_datasets(summ_dir, summ_config)
+
+    # Download books
+    if not args.skip_books:
+        books_config = downloads.get("books", [])
+        if isinstance(books_config, list):
+            books_dir = Path(raw_paths.get("books", "data/raw/books"))
+            download_books(books_dir, books_config)
+
+    print("\n" + "=" * 60)
+    print("✅ Download complete!")
+    print("=" * 60)


 if __name__ == "__main__":
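After a run such as python scripts/download_data.py --skip-books, each task directory holds JSONL splits plus a labels.json. A minimal sketch of turning one GoEmotions record into a multi-hot target for the multi_label setting (illustrative only; the project's real preprocessing lives in scripts/preprocess_data.py below):

import json
from pathlib import Path

emotion_dir = Path("data/raw/emotion")
label_names = json.loads((emotion_dir / "labels.json").read_text(encoding="utf-8"))
index = {name: i for i, name in enumerate(label_names)}

with (emotion_dir / "train.jsonl").open(encoding="utf-8") as f:
    record = json.loads(next(f))  # {"text": ..., "emotions": [...]}

target = [0] * len(label_names)
for emotion in record["emotions"]:
    target[index[emotion]] = 1  # multi-hot vector, one slot per emotion label

print(record["text"][:60], sum(target))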
scripts/preprocess_data.py
CHANGED
@@ -141,24 +141,42 @@ def preprocess_summarization(raw_dir: Path, processed_dir: Path) -> None:
         return

     for split in ("train", "validation", "test"):
+        # Check for JSONL first (from new download script), then CSV (legacy)
+        jsonl_path = raw_dir / f"{split}.jsonl"
+        csv_path = _resolve_csv(raw_dir, f"{split}.csv")
+
+        if jsonl_path.exists():
+            source_path = jsonl_path
+            is_jsonl = True
+        elif csv_path is not None:
+            source_path = csv_path
+            is_jsonl = False
+        else:
             print(f"Skipping summarization split '{split}' (file not found)")
             continue

         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
+
+        with output_path.open("w", encoding="utf-8") as sink:
+            if is_jsonl:
+                # Process JSONL format (from new download script)
+                for row in _read_jsonl(source_path):
+                    source = str(row.get("source") or row.get("article") or "")
+                    summary = str(row.get("summary") or row.get("highlights") or "")
+                    if source and summary:
+                        payload = {"source": source.strip(), "summary": summary.strip()}
+                        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
+            else:
+                # Process CSV format (legacy)
+                with source_path.open("r", encoding="utf-8", newline="") as source_handle:
+                    reader = csv.DictReader(source_handle)
+                    for row in reader:
+                        article = str(row.get("article") or row.get("Article") or "")
+                        highlights = str(row.get("highlights") or row.get("summary") or "")
+                        payload = {"source": article.strip(), "summary": highlights.strip()}
+                        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")


 def preprocess_emotion(raw_dir: Path, processed_dir: Path, cleaner: BasicTextCleaner) -> None:
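Whichever input format is found, the processed summarization splits end up as JSONL records keyed by source and summary; a quick sanity check, assuming preprocessing has already been run:

import json
from pathlib import Path

with Path("data/processed/summarization/validation.jsonl").open(encoding="utf-8") as f:
    first = json.loads(next(f))

print(sorted(first))  # ['source', 'summary']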