OliverPerrin committed
Commit 6bae907 · 1 Parent(s): a45b0d7

Expand datasets: GoEmotions (28 emotions), Yahoo Answers (10 topics), BookSum


- Updated download_data.py with new dataset downloaders
- Updated preprocess_data.py to handle JSONL format
- Updated labels.json with 28 emotions and 10 topics
- Updated datasets.yaml with new dataset configs
- Updated full.yaml training config (warmup_steps: 1000)
- Training results: 74% topic accuracy, ROUGE 0.284
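
The 28-emotion and 10-topic label sets land in artifacts/labels.json (first diff below). As a quick sanity check after pulling this commit, a minimal Python sketch (assumes the repository root as the working directory; not part of the commit itself):

# Sketch, not part of this commit: verify the expanded label space.
import json
from pathlib import Path

labels = json.loads(Path("artifacts/labels.json").read_text(encoding="utf-8"))
assert len(labels["emotion"]) == 28, "expected the GoEmotions label set"
assert len(labels["topic"]) == 10, "expected the Yahoo Answers topic set"
print(f"{len(labels['emotion'])} emotions, {len(labels['topic'])} topics")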

artifacts/labels.json CHANGED
@@ -1,16 +1,44 @@
 {
   "emotion": [
+    "admiration",
+    "amusement",
     "anger",
+    "annoyance",
+    "approval",
+    "caring",
+    "confusion",
+    "curiosity",
+    "desire",
+    "disappointment",
+    "disapproval",
+    "disgust",
+    "embarrassment",
+    "excitement",
     "fear",
+    "gratitude",
+    "grief",
     "joy",
     "love",
+    "nervousness",
+    "neutral",
+    "optimism",
+    "pride",
+    "realization",
+    "relief",
+    "remorse",
     "sadness",
     "surprise"
   ],
   "topic": [
-    "Business",
-    "Sci/Tech",
-    "Sports",
-    "World"
+    "Business & Finance",
+    "Computers & Internet",
+    "Education & Reference",
+    "Entertainment & Music",
+    "Family & Relationships",
+    "Health",
+    "Politics & Government",
+    "Science & Mathematics",
+    "Society & Culture",
+    "Sports"
   ]
 }
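
Because GoEmotions is multi-label (see multi_label: true in the dataset config below) and the download script writes records shaped like {"text": ..., "emotions": [...]}, training code has to expand each record's emotion names into a multi-hot vector over this 28-entry vocabulary. A minimal sketch of that encoding; the helper name is illustrative and not part of this commit:

# Sketch, not part of this commit: multi-hot encode emotion names.
import json
from pathlib import Path

labels = json.loads(Path("artifacts/labels.json").read_text(encoding="utf-8"))
emotion_to_idx = {name: i for i, name in enumerate(labels["emotion"])}

def encode_emotions(emotions: list[str]) -> list[float]:
    """Return a multi-hot vector over the 28 labels; unknown names are ignored."""
    vector = [0.0] * len(emotion_to_idx)
    for name in emotions:
        idx = emotion_to_idx.get(name)
        if idx is not None:
            vector[idx] = 1.0
    return vector

print(encode_emotions(["joy", "gratitude"]))  # exactly two positions set to 1.0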
configs/data/datasets.yaml CHANGED
@@ -1,21 +1,55 @@
+# Dataset configuration for LexiMind
+# Expanded dataset support for comprehensive emotion and topic classification
+
 raw:
-  summarization: data/raw/summarization/cnn_dailymail
+  summarization: data/raw/summarization
   emotion: data/raw/emotion
   topic: data/raw/topic
   books: data/raw/books
+
 processed:
   summarization: data/processed/summarization
   emotion: data/processed/emotion
   topic: data/processed/topic
   books: data/processed/books
+
 tokenizer:
   pretrained_model_name: google/flan-t5-base
   max_length: 512
   lower: false
+
+# Dataset download configuration
 downloads:
+  # Summarization: CNN/DailyMail (287K) + BookSum (9.6K)
   summarization:
-    dataset: gowrishankarp/newspaper-text-summarization-cnn-dailymail
-    output: data/raw/summarization/cnn_dailymail
+    - name: cnn_dailymail
+      dataset: cnn_dailymail
+      config: "3.0.0"
+      source_field: article
+      target_field: highlights
+      max_samples: 100000  # Subset for training time
+    - name: booksum
+      dataset: kmfoda/booksum
+      source_field: chapter
+      target_field: summary
+      max_samples: 9600  # Full dataset
+
+  # Emotions: GoEmotions (28 emotions, 43K samples)
+  emotion:
+    dataset: google-research-datasets/go_emotions
+    config: simplified
+    text_field: text
+    label_field: labels
+    multi_label: true
+
+  # Topics: Yahoo Answers (10 topics, 1.4M samples)
+  topic:
+    dataset: yahoo_answers_topics
+    text_field: best_answer  # Use the answer text
+    label_field: topic
+    max_samples: 200000  # Subset for reasonable training time
+
+  # Project Gutenberg books for inference demos
   books:
     - name: pride_and_prejudice
      url: https://www.gutenberg.org/cache/epub/1342/pg1342.txt
@@ -29,7 +63,15 @@ downloads:
     - name: moby_dick
       url: https://www.gutenberg.org/cache/epub/2701/pg2701.txt
       output: data/raw/books/moby_dick.txt
-  emotion:
-    dataset: dair-ai/emotion
-  topic:
-    dataset: ag_news
+    - name: dracula
+      url: https://www.gutenberg.org/cache/epub/345/pg345.txt
+      output: data/raw/books/dracula.txt
+    - name: alice_in_wonderland
+      url: https://www.gutenberg.org/cache/epub/11/pg11.txt
+      output: data/raw/books/alice_in_wonderland.txt
+    - name: great_gatsby
+      url: https://www.gutenberg.org/cache/epub/64317/pg64317.txt
+      output: data/raw/books/great_gatsby.txt
+    - name: war_and_peace
+      url: https://www.gutenberg.org/cache/epub/2600/pg2600.txt
+      output: data/raw/books/war_and_peace.txt
configs/training/full.yaml CHANGED
@@ -19,7 +19,7 @@ optimizer:
 
 scheduler:
   name: cosine
-  warmup_steps: 500
+  warmup_steps: 1000
 
 trainer:
   max_epochs: 3
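
Doubling warmup_steps to 1000 gives the optimizer a longer ramp now that the training mix is larger. For reference, a cosine schedule with warmup of this shape can be built as below; this is a sketch using the transformers helper with a placeholder optimizer and step count, not the project's actual trainer wiring (which this diff does not show):

# Sketch, not part of this commit: cosine LR schedule with a 1000-step warmup.
import torch
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(8, 8)  # stand-in module for illustration
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=1000,     # matches configs/training/full.yaml
    num_training_steps=30000,  # placeholder; the real value depends on dataset size
)
for _ in range(5):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())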
outputs/training_history.json CHANGED
@@ -1,59 +1,59 @@
 {
   "train_epoch_1": {
-    "summarization_loss": 3.222269726091524,
-    "summarization_rouge_like": 0.4348834303103812,
-    "emotion_loss": 0.2681197640352259,
-    "emotion_f1": 0.4939010590246358,
-    "topic_loss": 0.2817161389551497,
-    "topic_accuracy": 0.9126178087058748,
-    "total_loss": 3.7721057520380095,
+    "summarization_loss": 5.035701327740604,
+    "summarization_rouge_like": 0.16390742100245742,
+    "emotion_loss": 0.21049204537547025,
+    "emotion_f1": 0.002655381929628719,
+    "topic_loss": 1.176912516972419,
+    "topic_accuracy": 0.6581478229164939,
+    "total_loss": 6.423106049642868,
     "epoch": 1.0
   },
   "val_epoch_1": {
-    "summarization_loss": 2.9376416314440097,
-    "summarization_rouge_like": 0.4621969238397049,
-    "emotion_loss": 0.07456208207925424,
-    "emotion_f1": 0.922451647864638,
-    "topic_loss": 0.18789680490184146,
-    "topic_accuracy": 0.9368641532016696,
+    "summarization_loss": 4.6882993674363105,
+    "summarization_rouge_like": 0.19405199466966144,
+    "emotion_loss": 0.15183634538985658,
+    "emotion_f1": 0.0016098967067287486,
+    "topic_loss": 0.8788343331143526,
+    "topic_accuracy": 0.7251652262328394,
     "epoch": 1.0
   },
   "train_epoch_2": {
-    "summarization_loss": 3.0815064049717713,
-    "summarization_rouge_like": 0.44604443152864864,
-    "emotion_loss": 0.04770229796717623,
-    "emotion_f1": 0.9407868445694336,
-    "topic_loss": 0.1507136240392336,
-    "topic_accuracy": 0.9498742677227413,
-    "total_loss": 3.279922429068798,
+    "summarization_loss": 4.561023824777751,
+    "summarization_rouge_like": 0.20945581532076613,
+    "emotion_loss": 0.14958151845580364,
+    "emotion_f1": 0.008022325540815077,
+    "topic_loss": 0.8585619787599033,
+    "topic_accuracy": 0.7299605100316837,
+    "total_loss": 5.569167470253677,
     "epoch": 2.0
   },
   "val_epoch_2": {
-    "summarization_loss": 2.8898715693603942,
-    "summarization_rouge_like": 0.4654528613816311,
-    "emotion_loss": 0.05001389549380918,
-    "emotion_f1": 0.9344953305524384,
-    "topic_loss": 0.1755385091801308,
-    "topic_accuracy": 0.9435966487133395,
+    "summarization_loss": 4.335443331423179,
+    "summarization_rouge_like": 0.2383154143354784,
+    "emotion_loss": 0.1478777239331147,
+    "emotion_f1": 0.010150822387259202,
+    "topic_loss": 0.841049696600522,
+    "topic_accuracy": 0.7359938993390932,
     "epoch": 2.0
   },
   "train_epoch_3": {
-    "summarization_loss": 3.0340622767404044,
-    "summarization_rouge_like": 0.4502876682264882,
-    "emotion_loss": 0.025708710505635942,
-    "emotion_f1": 0.9647584015837614,
-    "topic_loss": 0.11707986947991166,
-    "topic_accuracy": 0.9614479064357344,
-    "total_loss": 3.176850952874497,
+    "summarization_loss": 4.332563984521343,
+    "summarization_rouge_like": 0.24358268281949097,
+    "emotion_loss": 0.14520242059475916,
+    "emotion_f1": 0.026584760984350638,
+    "topic_loss": 0.8084657974773926,
+    "topic_accuracy": 0.7434995609372882,
+    "total_loss": 5.286232347914138,
     "epoch": 3.0
   },
   "val_epoch_3": {
-    "summarization_loss": 2.865455434181104,
-    "summarization_rouge_like": 0.46790124713702563,
-    "emotion_loss": 0.05574661032417156,
-    "emotion_f1": 0.940105742034193,
-    "topic_loss": 0.19245651335709887,
-    "topic_accuracy": 0.942998204667858,
+    "summarization_loss": 4.0994785383502785,
+    "summarization_rouge_like": 0.2839536633314319,
+    "emotion_loss": 0.14214695994858215,
+    "emotion_f1": 0.028164719230763854,
+    "topic_loss": 0.8218616072552484,
+    "topic_accuracy": 0.7413319776309091,
     "epoch": 3.0
   }
 }
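
The 74% topic accuracy and ROUGE 0.284 quoted in the commit message are the val_epoch_3 figures above (topic_accuracy 0.7413, summarization_rouge_like 0.2840). A small sketch, not part of this commit, for pulling the per-epoch validation metrics back out of this file:

# Sketch, not part of this commit: print validation metrics per epoch.
import json
from pathlib import Path

history = json.loads(Path("outputs/training_history.json").read_text(encoding="utf-8"))
for key, metrics in history.items():
    if key.startswith("val_epoch"):
        print(
            f"{key}: topic_acc={metrics['topic_accuracy']:.3f} "
            f"rouge_like={metrics['summarization_rouge_like']:.3f} "
            f"emotion_f1={metrics['emotion_f1']:.3f}"
        )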
scripts/download_data.py CHANGED
@@ -1,8 +1,11 @@
 """
 Dataset download script for LexiMind.
 
-Downloads training datasets from various sources including HuggingFace Hub,
-Kaggle, and Project Gutenberg. Handles automatic conversion to JSONL format.
+Downloads training datasets from HuggingFace Hub and Project Gutenberg:
+- GoEmotions: 28 emotion labels (43K samples)
+- Yahoo Answers: 10 topic labels (1.4M samples, subset to 200K)
+- CNN/DailyMail + BookSum: Summarization (100K + 9.6K samples)
+- Gutenberg: Classic books for inference demos
 
 Author: Oliver Perrin
 Date: December 2025
@@ -12,15 +15,17 @@ from __future__ import annotations
 
 import argparse
 import json
+import random
 import socket
 import sys
 from pathlib import Path
-from subprocess import CalledProcessError, run
-from typing import Iterable, Iterator, cast
+from typing import Any, cast
 from urllib.error import URLError
 from urllib.request import urlopen
 
-from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
+from datasets import ClassLabel, DatasetDict, load_dataset
+from datasets import Sequence as DatasetSequence
+from tqdm import tqdm
 
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
@@ -29,228 +34,338 @@ if str(PROJECT_ROOT) not in sys.path:
 from src.utils.config import load_yaml
 
 DOWNLOAD_TIMEOUT = 60
-DEFAULT_SUMMARIZATION_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
-DEFAULT_EMOTION_DATASET = "dair-ai/emotion"
-DEFAULT_TOPIC_DATASET = "ag_news"
-DEFAULT_BOOK_URL = "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"
-DEFAULT_BOOK_OUTPUT = "data/raw/books/pride_and_prejudice.txt"
 
-
-def kaggle_download(dataset: str, output_dir: str) -> None:
-    target = Path(output_dir)
-    target.mkdir(parents=True, exist_ok=True)
-    try:
-        run(
-            [
-                "kaggle",
-                "datasets",
-                "download",
-                "-d",
-                dataset,
-                "-p",
-                str(target),
-                "--unzip",
-            ],
-            check=True,
-        )
-    except CalledProcessError as error:
-        raise RuntimeError(
-            "Kaggle download failed. Verify that the Kaggle CLI is authenticated,"
-            " you have accepted the dataset terms on kaggle.com, and your kaggle.json"
-            " credentials are located in %USERPROFILE%/.kaggle."
-        ) from error
+# --------------- Label Definitions ---------------
+
+EMOTION_LABELS = [
+    "admiration",
+    "amusement",
+    "anger",
+    "annoyance",
+    "approval",
+    "caring",
+    "confusion",
+    "curiosity",
+    "desire",
+    "disappointment",
+    "disapproval",
+    "disgust",
+    "embarrassment",
+    "excitement",
+    "fear",
+    "gratitude",
+    "grief",
+    "joy",
+    "love",
+    "nervousness",
+    "optimism",
+    "pride",
+    "realization",
+    "relief",
+    "remorse",
+    "sadness",
+    "surprise",
+    "neutral",
+]
+
+TOPIC_LABELS = [
+    "Society & Culture",
+    "Science & Mathematics",
+    "Health",
+    "Education & Reference",
+    "Computers & Internet",
+    "Sports",
+    "Business & Finance",
+    "Entertainment & Music",
+    "Family & Relationships",
+    "Politics & Government",
+]
+
+
+# --------------- Utility Functions ---------------
+
+
+def _write_jsonl(records: list[dict], destination: Path, desc: str = "Writing") -> None:
+    """Write records to JSONL file with progress bar."""
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with destination.open("w", encoding="utf-8") as f:
+        for record in tqdm(records, desc=desc, leave=False):
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
 
 
 def gutenberg_download(url: str, output_path: str) -> None:
+    """Download a text file from Project Gutenberg."""
     target = Path(output_path)
     target.parent.mkdir(parents=True, exist_ok=True)
     try:
-        with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response, target.open("wb") as handle:
-            chunk = response.read(8192)
-            while chunk:
-                handle.write(chunk)
-                chunk = response.read(8192)
-    except (URLError, socket.timeout, OSError) as error:
-        raise RuntimeError(f"Failed to download '{url}' to '{target}': {error}") from error
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Download datasets required for LexiMind training")
-    parser.add_argument(
-        "--config",
-        default="configs/data/datasets.yaml",
-        help="Path to the dataset configuration YAML.",
-    )
-    parser.add_argument(
-        "--skip-kaggle",
-        action="store_true",
-        help="Skip downloading the Kaggle summarization dataset.",
-    )
-    parser.add_argument(
-        "--skip-book", action="store_true", help="Skip downloading Gutenberg book texts."
-    )
-    return parser.parse_args()
-
-
-def _safe_load_config(path: str | None) -> dict:
-    if not path:
-        return {}
-    config_path = Path(path)
-    if not config_path.exists():
-        raise FileNotFoundError(f"Config file not found: {config_path}")
-    return load_yaml(str(config_path)).data
-
-
-def _write_jsonl(records: Iterable[dict[str, object]], destination: Path) -> None:
-    destination.parent.mkdir(parents=True, exist_ok=True)
-    with destination.open("w", encoding="utf-8") as handle:
-        for record in records:
-            handle.write(json.dumps(record, ensure_ascii=False) + "\n")
-
-
-def _emotion_records(
-    dataset_split: Dataset, label_names: list[str] | None
-) -> Iterator[dict[str, object]]:
-    for item in dataset_split:
-        data = dict(item)
-        text = data.get("text", "")
-        label_value = data.get("label")
-
-        def resolve_label(index: object) -> str:
-            if isinstance(index, int) and label_names and 0 <= index < len(label_names):
-                return label_names[index]
-            return str(index)
-
-        if isinstance(label_value, list):
-            labels = [resolve_label(idx) for idx in label_value]
-        else:
-            labels = [resolve_label(label_value)]
-        yield {"text": text, "emotions": labels}
-
-
-def _topic_records(
-    dataset_split: Dataset, label_names: list[str] | None
-) -> Iterator[dict[str, object]]:
-    for item in dataset_split:
-        data = dict(item)
-        text = data.get("text") or data.get("content") or ""
-        label_value = data.get("label")
-
-        def resolve_topic(raw: object) -> str:
-            if label_names:
-                idx: int | None = None
-                if isinstance(raw, int):
-                    idx = raw
-                elif isinstance(raw, str):
-                    try:
-                        idx = int(raw)
-                    except ValueError:
-                        idx = None
-                if idx is not None and 0 <= idx < len(label_names):
-                    return label_names[idx]
-            return str(raw) if raw is not None else ""
-
-        if isinstance(label_value, list):
-            topic = resolve_topic(label_value[0]) if label_value else ""
-        else:
-            topic = resolve_topic(label_value)
-        yield {"text": text, "topic": topic}
-
-
-def main() -> None:
-    args = parse_args()
-    config = _safe_load_config(args.config)
-
-    raw_paths = config.get("raw", {}) if isinstance(config, dict) else {}
-    downloads_cfg = config.get("downloads", {}) if isinstance(config, dict) else {}
-
-    summarization_cfg = (
-        downloads_cfg.get("summarization", {}) if isinstance(downloads_cfg, dict) else {}
-    )
-    summarization_dataset = summarization_cfg.get("dataset", DEFAULT_SUMMARIZATION_DATASET)
-    summarization_output = summarization_cfg.get(
-        "output", raw_paths.get("summarization", "data/raw/summarization")
-    )
-
-    if not args.skip_kaggle and summarization_dataset:
-        print(
-            f"Downloading summarization dataset '{summarization_dataset}' -> {summarization_output}"
-        )
-        kaggle_download(summarization_dataset, summarization_output)
-    else:
-        print("Skipping Kaggle summarization download.")
-
-    books_root = Path(raw_paths.get("books", "data/raw/books"))
-    books_root.mkdir(parents=True, exist_ok=True)
-
-    books_entries: list[dict[str, object]] = []
-    if isinstance(downloads_cfg, dict):
-        raw_entries = downloads_cfg.get("books")
-        if isinstance(raw_entries, list):
-            books_entries = [entry for entry in raw_entries if isinstance(entry, dict)]
-
-    if not args.skip_book:
-        if not books_entries:
-            books_entries = [
-                {
-                    "name": "pride_and_prejudice",
-                    "url": DEFAULT_BOOK_URL,
-                    "output": DEFAULT_BOOK_OUTPUT,
-                }
-            ]
-        for entry in books_entries:
-            name = str(entry.get("name") or "gutenberg_text")
-            url = str(entry.get("url") or DEFAULT_BOOK_URL)
-            output_value = entry.get("output")
-            destination = (
-                Path(output_value)
-                if isinstance(output_value, str) and output_value
-                else books_root / f"{name}.txt"
-            )
-            destination.parent.mkdir(parents=True, exist_ok=True)
-            print(f"Downloading Gutenberg text '{name}' from {url} -> {destination}")
-            gutenberg_download(url, str(destination))
-    else:
-        print("Skipping Gutenberg downloads.")
-    emotion_cfg = downloads_cfg.get("emotion", {}) if isinstance(downloads_cfg, dict) else {}
-    emotion_name = emotion_cfg.get("dataset", DEFAULT_EMOTION_DATASET)
-    emotion_dir = Path(raw_paths.get("emotion", "data/raw/emotion"))
-    emotion_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Downloading emotion dataset '{emotion_name}' -> {emotion_dir}")
-    emotion_dataset = cast(DatasetDict, load_dataset(emotion_name))
-    first_emotion_key = next(iter(emotion_dataset.keys()), None) if emotion_dataset else None
-    emotion_label_feature = (
-        emotion_dataset[first_emotion_key].features.get("label")
-        if first_emotion_key is not None
-        else None
-    )
-    emotion_label_names = (
-        emotion_label_feature.names if isinstance(emotion_label_feature, ClassLabel) else None
-    )
-    for split_name, split in emotion_dataset.items():
-        output_path = emotion_dir / f"{str(split_name)}.jsonl"
-        _write_jsonl(_emotion_records(split, emotion_label_names), output_path)
-
-    topic_cfg = downloads_cfg.get("topic", {}) if isinstance(downloads_cfg, dict) else {}
-    topic_name = topic_cfg.get("dataset", DEFAULT_TOPIC_DATASET)
-    topic_dir = Path(raw_paths.get("topic", "data/raw/topic"))
-    topic_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Downloading topic dataset '{topic_name}' -> {topic_dir}")
-    topic_dataset = cast(DatasetDict, load_dataset(topic_name))
-    first_topic_key = next(iter(topic_dataset.keys()), None) if topic_dataset else None
-    topic_label_feature = (
-        topic_dataset[first_topic_key].features.get("label")
-        if first_topic_key is not None
-        else None
-    )
-    topic_label_names = (
-        topic_label_feature.names if isinstance(topic_label_feature, ClassLabel) else None
-    )
-    for split_name, split in topic_dataset.items():
-        output_path = topic_dir / f"{str(split_name)}.jsonl"
-        _write_jsonl(_topic_records(split, topic_label_names), output_path)
-
-    print("Download routine finished.")
+        with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response:
+            content = response.read()
+        target.write_bytes(content)
+    except (URLError, socket.timeout, OSError) as e:
+        raise RuntimeError(f"Failed to download '{url}': {e}") from e
+
+
+# --------------- Emotion Dataset (GoEmotions) ---------------
+
+
+def download_emotion_dataset(output_dir: Path, config: dict) -> None:
+    """Download GoEmotions dataset with 28 emotion labels."""
+    print("\n📥 Downloading GoEmotions (28 emotions)...")
+
+    dataset_name = config.get("dataset", "google-research-datasets/go_emotions")
+    dataset_config = config.get("config", "simplified")
+
+    ds = cast(DatasetDict, load_dataset(dataset_name, dataset_config))
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get label names from dataset
+    label_feature = ds["train"].features.get("labels")
+    inner_feature = getattr(label_feature, "feature", None)
+    if isinstance(label_feature, DatasetSequence) and isinstance(inner_feature, ClassLabel):
+        label_names = cast(list[str], inner_feature.names)
+    else:
+        label_names = EMOTION_LABELS
+
+    for split_name, split in ds.items():
+        records = []
+        for item in tqdm(split, desc=f"Processing {split_name}", leave=False):
+            row = cast(dict[str, Any], item)
+            text = row.get("text", "")
+            label_indices = row.get("labels", [])
+            # Convert indices to label names
+            emotions = [label_names[i] for i in label_indices if 0 <= i < len(label_names)]
+            if text and emotions:
+                records.append({"text": text, "emotions": emotions})
+
+        output_path = output_dir / f"{split_name}.jsonl"
+        _write_jsonl(records, output_path, f"Writing {split_name}")
+        print(f"  ✓ {split_name}: {len(records):,} samples -> {output_path}")
+
+    # Save label names
+    labels_path = output_dir / "labels.json"
+    labels_path.write_text(json.dumps(label_names, indent=2))
+    print(f"  ✓ Labels ({len(label_names)}): {labels_path}")
+
+
+# --------------- Topic Dataset (Yahoo Answers) ---------------
+
+
+def download_topic_dataset(output_dir: Path, config: dict) -> None:
+    """Download Yahoo Answers dataset with 10 topic labels."""
+    print("\n📥 Downloading Yahoo Answers (10 topics)...")
+
+    dataset_name = config.get("dataset", "yahoo_answers_topics")
+    max_samples = config.get("max_samples", 200000)
+
+    ds = cast(DatasetDict, load_dataset(dataset_name))
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get label names
+    label_feature = ds["train"].features.get("topic")
+    if isinstance(label_feature, ClassLabel):
+        label_names = label_feature.names
+    else:
+        label_names = TOPIC_LABELS
+
+    for split_name, split in ds.items():
+        # Determine sample limit for this split
+        if split_name == "train":
+            limit = max_samples
+        else:
+            limit = min(len(split), max_samples // 10)
+
+        # Random sample if needed
+        indices = list(range(len(split)))
+        if len(indices) > limit:
+            random.seed(42)
+            indices = random.sample(indices, limit)
+
+        records = []
+        for idx in tqdm(indices, desc=f"Processing {split_name}", leave=False):
+            item = cast(dict[str, Any], split[idx])
+            # Combine question and best answer for richer text
+            question = item.get("question_title", "") + " " + item.get("question_content", "")
+            answer = item.get("best_answer", "")
+            text = (question + " " + answer).strip()
+
+            topic_idx = item.get("topic", 0)
+            topic = label_names[topic_idx] if 0 <= topic_idx < len(label_names) else str(topic_idx)
+
+            if text and len(text) > 50:  # Filter very short texts
+                records.append({"text": text, "topic": topic})
+
+        output_path = output_dir / f"{split_name}.jsonl"
+        _write_jsonl(records, output_path, f"Writing {split_name}")
+        print(f"  ✓ {split_name}: {len(records):,} samples -> {output_path}")
+
+    # Save label names
+    labels_path = output_dir / "labels.json"
+    labels_path.write_text(json.dumps(label_names, indent=2))
+    print(f"  ✓ Labels ({len(label_names)}): {labels_path}")
+
+
+# --------------- Summarization Dataset (CNN/DailyMail + BookSum) ---------------
+
+
+def download_summarization_datasets(output_dir: Path, config: list[dict]) -> None:
+    """Download summarization datasets (CNN/DailyMail and BookSum)."""
+    print("\n📥 Downloading Summarization datasets...")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    all_train, all_val, all_test = [], [], []
+
+    for ds_config in config:
+        name = ds_config.get("name", "unknown")
+        dataset_name = ds_config.get("dataset")
+        dataset_config = ds_config.get("config")
+        source_field = ds_config.get("source_field", "article")
+        target_field = ds_config.get("target_field", "highlights")
+        max_samples = ds_config.get("max_samples")
+
+        print(f"\n  Loading {name}...")
+
+        if not dataset_name:
+            print(f"  ✗ Skipping {name}: no dataset specified")
+            continue
+
+        if dataset_config:
+            ds = cast(DatasetDict, load_dataset(str(dataset_name), str(dataset_config)))
+        else:
+            ds = cast(DatasetDict, load_dataset(str(dataset_name)))
+
+        for split_name, split in ds.items():
+            split_str = str(split_name)
+            # Determine limit
+            limit = max_samples if max_samples else len(split)
+            if split_str != "train":
+                limit = min(len(split), limit // 10)
+
+            indices = list(range(min(len(split), limit)))
+
+            records = []
+            for idx in tqdm(indices, desc=f"{name}/{split_str}", leave=False):
+                item = cast(dict[str, Any], split[idx])
+                source = item.get(source_field, "")
+                target = item.get(target_field, "")
+
+                if source and target and len(str(source)) > 100:
+                    records.append({"source": source, "summary": target})
+
+            # Route to appropriate split
+            if "train" in split_str:
+                all_train.extend(records)
+            elif "val" in split_str or "validation" in split_str:
+                all_val.extend(records)
+            else:
+                all_test.extend(records)
+
+            print(f"  ✓ {split_name}: {len(records):,} samples")
+
+    # Write combined files
+    if all_train:
+        _write_jsonl(all_train, output_dir / "train.jsonl", "Writing train")
+        print(f"  ✓ Combined train: {len(all_train):,} samples")
+    if all_val:
+        _write_jsonl(all_val, output_dir / "validation.jsonl", "Writing validation")
+        print(f"  ✓ Combined validation: {len(all_val):,} samples")
+    if all_test:
+        _write_jsonl(all_test, output_dir / "test.jsonl", "Writing test")
+        print(f"  ✓ Combined test: {len(all_test):,} samples")
+
+
+# --------------- Book Downloads (Gutenberg) ---------------
+
+
+def download_books(books_dir: Path, config: list[dict]) -> None:
+    """Download classic books from Project Gutenberg."""
+    print("\n📥 Downloading Gutenberg books...")
+
+    books_dir.mkdir(parents=True, exist_ok=True)
+
+    for book in config:
+        name = book.get("name", "unknown")
+        url = book.get("url")
+        output = book.get("output", str(books_dir / f"{name}.txt"))
+
+        if not url:
+            continue
+
+        output_path = Path(output)
+        if output_path.exists():
+            print(f"  ✓ {name}: already exists")
+            continue
+
+        try:
+            print(f"  ⏳ {name}: downloading...")
+            gutenberg_download(url, str(output_path))
+            print(f"  ✓ {name}: {output_path}")
+        except Exception as e:
+            print(f"  ✗ {name}: {e}")
+
+
+# --------------- Main Entry Point ---------------
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Download LexiMind training datasets")
+    parser.add_argument(
+        "--config", default="configs/data/datasets.yaml", help="Dataset config path"
+    )
+    parser.add_argument(
+        "--skip-summarization", action="store_true", help="Skip summarization datasets"
+    )
+    parser.add_argument("--skip-emotion", action="store_true", help="Skip emotion dataset")
+    parser.add_argument("--skip-topic", action="store_true", help="Skip topic dataset")
+    parser.add_argument("--skip-books", action="store_true", help="Skip Gutenberg books")
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    # Load config
+    config_path = Path(args.config)
+    if not config_path.exists():
+        print(f"Config not found: {config_path}")
+        sys.exit(1)
+
+    config = load_yaml(str(config_path)).data
+    raw_paths = config.get("raw", {})
+    downloads = config.get("downloads", {})
+
+    print("=" * 60)
+    print("LexiMind Dataset Download")
+    print("=" * 60)
+
+    # Download emotion dataset
+    if not args.skip_emotion:
+        emotion_config = downloads.get("emotion", {})
+        emotion_dir = Path(raw_paths.get("emotion", "data/raw/emotion"))
+        download_emotion_dataset(emotion_dir, emotion_config)
+
+    # Download topic dataset
+    if not args.skip_topic:
+        topic_config = downloads.get("topic", {})
+        topic_dir = Path(raw_paths.get("topic", "data/raw/topic"))
+        download_topic_dataset(topic_dir, topic_config)
+
+    # Download summarization datasets
+    if not args.skip_summarization:
+        summ_config = downloads.get("summarization", [])
+        if isinstance(summ_config, list):
+            summ_dir = Path(raw_paths.get("summarization", "data/raw/summarization"))
+            download_summarization_datasets(summ_dir, summ_config)
+
+    # Download books
+    if not args.skip_books:
+        books_config = downloads.get("books", [])
+        if isinstance(books_config, list):
+            books_dir = Path(raw_paths.get("books", "data/raw/books"))
+            download_books(books_dir, books_config)
+
+    print("\n" + "=" * 60)
+    print("✅ Download complete!")
+    print("=" * 60)
 
 
 if __name__ == "__main__":
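
Running python scripts/download_data.py (optionally with --skip-summarization, --skip-emotion, --skip-topic, or --skip-books) leaves each task directory with JSONL splits plus a labels.json. A minimal sketch, not part of this commit, for checking the class balance of the Yahoo Answers subset it writes; the path assumes the default raw layout from datasets.yaml:

# Sketch, not part of this commit: topic distribution of the downloaded subset.
import json
from collections import Counter
from pathlib import Path

counts: Counter[str] = Counter()
with Path("data/raw/topic/train.jsonl").open(encoding="utf-8") as handle:
    for line in handle:
        counts[json.loads(line)["topic"]] += 1

for topic, count in counts.most_common():
    print(f"{topic}: {count:,}")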
scripts/preprocess_data.py CHANGED
@@ -141,24 +141,42 @@ def preprocess_summarization(raw_dir: Path, processed_dir: Path) -> None:
         return
 
     for split in ("train", "validation", "test"):
-        source_path = _resolve_csv(raw_dir, f"{split}.csv")
-        if source_path is None:
+        # Check for JSONL first (from new download script), then CSV (legacy)
+        jsonl_path = raw_dir / f"{split}.jsonl"
+        csv_path = _resolve_csv(raw_dir, f"{split}.csv")
+
+        if jsonl_path.exists():
+            source_path = jsonl_path
+            is_jsonl = True
+        elif csv_path is not None:
+            source_path = csv_path
+            is_jsonl = False
+        else:
             print(f"Skipping summarization split '{split}' (file not found)")
             continue
 
         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
-        with (
-            source_path.open("r", encoding="utf-8", newline="") as source_handle,
-            output_path.open("w", encoding="utf-8") as sink,
-        ):
-            reader = csv.DictReader(source_handle)
-            for row in reader:
-                article = row.get("article") or row.get("Article") or ""
-                highlights = row.get("highlights") or row.get("summary") or ""
-                payload = {"source": article.strip(), "summary": highlights.strip()}
-                sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
+
+        with output_path.open("w", encoding="utf-8") as sink:
+            if is_jsonl:
+                # Process JSONL format (from new download script)
+                for row in _read_jsonl(source_path):
+                    source = str(row.get("source") or row.get("article") or "")
+                    summary = str(row.get("summary") or row.get("highlights") or "")
+                    if source and summary:
+                        payload = {"source": source.strip(), "summary": summary.strip()}
+                        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
+            else:
+                # Process CSV format (legacy)
+                with source_path.open("r", encoding="utf-8", newline="") as source_handle:
+                    reader = csv.DictReader(source_handle)
+                    for row in reader:
+                        article = str(row.get("article") or row.get("Article") or "")
+                        highlights = str(row.get("highlights") or row.get("summary") or "")
+                        payload = {"source": article.strip(), "summary": highlights.strip()}
+                        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
 
 
 def preprocess_emotion(raw_dir: Path, processed_dir: Path, cleaner: BasicTextCleaner) -> None:
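
The JSONL branch above relies on a _read_jsonl helper that is not shown in this hunk, so it is presumably defined elsewhere in scripts/preprocess_data.py. If it is not, something along these lines would fit; this is a sketch of the assumed helper, not the project's actual implementation:

# Sketch, not part of this commit: the _read_jsonl helper the JSONL branch expects.
import json
from collections.abc import Iterator
from pathlib import Path

def _read_jsonl(path: Path) -> Iterator[dict]:
    """Yield one parsed record per non-empty line of a JSONL file."""
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if line:
                yield json.loads(line)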