OliverPerrin committed
Commit 6bae907 · 1 Parent(s): a45b0d7

Expand datasets: GoEmotions (28 emotions), Yahoo Answers (10 topics), BookSum


- Updated download_data.py with new dataset downloaders
- Updated preprocess_data.py to handle JSONL format
- Updated labels.json with 28 emotions and 10 topics
- Updated datasets.yaml with new dataset configs
- Updated full.yaml training config (warmup_steps: 1000)
- Training results: 74% topic accuracy, ROUGE 0.284
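
The 28-emotion and 10-topic label sets land in artifacts/labels.json (first diff below). As a quick sanity check after pulling this commit, a minimal Python sketch (assumes the repository root as the working directory; not part of the commit itself):

# Sketch, not part of this commit: verify the expanded label space.
import json
from pathlib import Path

labels = json.loads(Path("artifacts/labels.json").read_text(encoding="utf-8"))
assert len(labels["emotion"]) == 28, "expected the GoEmotions label set"
assert len(labels["topic"]) == 10, "expected the Yahoo Answers topic set"
print(f"{len(labels['emotion'])} emotions, {len(labels['topic'])} topics")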

artifacts/labels.json CHANGED
@@ -1,16 +1,44 @@
 {
   "emotion": [
+    "admiration",
+    "amusement",
     "anger",
+    "annoyance",
+    "approval",
+    "caring",
+    "confusion",
+    "curiosity",
+    "desire",
+    "disappointment",
+    "disapproval",
+    "disgust",
+    "embarrassment",
+    "excitement",
     "fear",
+    "gratitude",
+    "grief",
     "joy",
     "love",
+    "nervousness",
+    "neutral",
+    "optimism",
+    "pride",
+    "realization",
+    "relief",
+    "remorse",
     "sadness",
     "surprise"
   ],
   "topic": [
-    "Business",
-    "Sci/Tech",
-    "Sports",
-    "World"
+    "Business & Finance",
+    "Computers & Internet",
+    "Education & Reference",
+    "Entertainment & Music",
+    "Family & Relationships",
+    "Health",
+    "Politics & Government",
+    "Science & Mathematics",
+    "Society & Culture",
+    "Sports"
   ]
 }
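
Because GoEmotions is multi-label (see multi_label: true in the dataset config below) and the download script writes records shaped like {"text": ..., "emotions": [...]}, training code has to expand each record's emotion names into a multi-hot vector over this 28-entry vocabulary. A minimal sketch of that encoding; the helper name is illustrative and not part of this commit:

# Sketch, not part of this commit: multi-hot encode emotion names.
import json
from pathlib import Path

labels = json.loads(Path("artifacts/labels.json").read_text(encoding="utf-8"))
emotion_to_idx = {name: i for i, name in enumerate(labels["emotion"])}

def encode_emotions(emotions: list[str]) -> list[float]:
    """Return a multi-hot vector over the 28 labels; unknown names are ignored."""
    vector = [0.0] * len(emotion_to_idx)
    for name in emotions:
        idx = emotion_to_idx.get(name)
        if idx is not None:
            vector[idx] = 1.0
    return vector

print(encode_emotions(["joy", "gratitude"]))  # exactly two positions set to 1.0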
configs/data/datasets.yaml CHANGED
@@ -1,21 +1,55 @@
+# Dataset configuration for LexiMind
+# Expanded dataset support for comprehensive emotion and topic classification
+
 raw:
-  summarization: data/raw/summarization/cnn_dailymail
+  summarization: data/raw/summarization
   emotion: data/raw/emotion
   topic: data/raw/topic
   books: data/raw/books
+
 processed:
   summarization: data/processed/summarization
   emotion: data/processed/emotion
   topic: data/processed/topic
   books: data/processed/books
+
 tokenizer:
   pretrained_model_name: google/flan-t5-base
   max_length: 512
   lower: false
+
+# Dataset download configuration
 downloads:
+  # Summarization: CNN/DailyMail (287K) + BookSum (9.6K)
   summarization:
-    dataset: gowrishankarp/newspaper-text-summarization-cnn-dailymail
-    output: data/raw/summarization/cnn_dailymail
+    - name: cnn_dailymail
+      dataset: cnn_dailymail
+      config: "3.0.0"
+      source_field: article
+      target_field: highlights
+      max_samples: 100000  # Subset for training time
+    - name: booksum
+      dataset: kmfoda/booksum
+      source_field: chapter
+      target_field: summary
+      max_samples: 9600  # Full dataset
+
+  # Emotions: GoEmotions (28 emotions, 43K samples)
+  emotion:
+    dataset: google-research-datasets/go_emotions
+    config: simplified
+    text_field: text
+    label_field: labels
+    multi_label: true
+
+  # Topics: Yahoo Answers (10 topics, 1.4M samples)
+  topic:
+    dataset: yahoo_answers_topics
+    text_field: best_answer  # Use the answer text
+    label_field: topic
+    max_samples: 200000  # Subset for reasonable training time
+
+  # Project Gutenberg books for inference demos
   books:
     - name: pride_and_prejudice
      url: https://www.gutenberg.org/cache/epub/1342/pg1342.txt
@@ -29,7 +63,15 @@ downloads:
     - name: moby_dick
       url: https://www.gutenberg.org/cache/epub/2701/pg2701.txt
       output: data/raw/books/moby_dick.txt
-  emotion:
-    dataset: dair-ai/emotion
-  topic:
-    dataset: ag_news
+    - name: dracula
+      url: https://www.gutenberg.org/cache/epub/345/pg345.txt
+      output: data/raw/books/dracula.txt
+    - name: alice_in_wonderland
+      url: https://www.gutenberg.org/cache/epub/11/pg11.txt
+      output: data/raw/books/alice_in_wonderland.txt
+    - name: great_gatsby
+      url: https://www.gutenberg.org/cache/epub/64317/pg64317.txt
+      output: data/raw/books/great_gatsby.txt
+    - name: war_and_peace
+      url: https://www.gutenberg.org/cache/epub/2600/pg2600.txt
+      output: data/raw/books/war_and_peace.txt
configs/training/full.yaml CHANGED
@@ -19,7 +19,7 @@ optimizer:
 
 scheduler:
   name: cosine
-  warmup_steps: 500
+  warmup_steps: 1000
 
 trainer:
   max_epochs: 3
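
Doubling warmup_steps to 1000 gives the optimizer a longer ramp now that the training mix is larger. For reference, a cosine schedule with warmup of this shape can be built as below; this is a sketch using the transformers helper with a placeholder optimizer and step count, not the project's actual trainer wiring (which this diff does not show):

# Sketch, not part of this commit: cosine LR schedule with a 1000-step warmup.
import torch
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(8, 8)  # stand-in module for illustration
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=1000,     # matches configs/training/full.yaml
    num_training_steps=30000,  # placeholder; the real value depends on dataset size
)
for _ in range(5):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())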
outputs/training_history.json CHANGED
@@ -1,59 +1,59 @@
 {
   "train_epoch_1": {
-    "summarization_loss": 3.222269726091524,
-    "summarization_rouge_like": 0.4348834303103812,
-    "emotion_loss": 0.2681197640352259,
-    "emotion_f1": 0.4939010590246358,
-    "topic_loss": 0.2817161389551497,
-    "topic_accuracy": 0.9126178087058748,
-    "total_loss": 3.7721057520380095,
+    "summarization_loss": 5.035701327740604,
+    "summarization_rouge_like": 0.16390742100245742,
+    "emotion_loss": 0.21049204537547025,
+    "emotion_f1": 0.002655381929628719,
+    "topic_loss": 1.176912516972419,
+    "topic_accuracy": 0.6581478229164939,
+    "total_loss": 6.423106049642868,
     "epoch": 1.0
   },
   "val_epoch_1": {
-    "summarization_loss": 2.9376416314440097,
-    "summarization_rouge_like": 0.4621969238397049,
-    "emotion_loss": 0.07456208207925424,
-    "emotion_f1": 0.922451647864638,
-    "topic_loss": 0.18789680490184146,
-    "topic_accuracy": 0.9368641532016696,
+    "summarization_loss": 4.6882993674363105,
+    "summarization_rouge_like": 0.19405199466966144,
+    "emotion_loss": 0.15183634538985658,
+    "emotion_f1": 0.0016098967067287486,
+    "topic_loss": 0.8788343331143526,
+    "topic_accuracy": 0.7251652262328394,
     "epoch": 1.0
   },
   "train_epoch_2": {
-    "summarization_loss": 3.0815064049717713,
-    "summarization_rouge_like": 0.44604443152864864,
-    "emotion_loss": 0.04770229796717623,
-    "emotion_f1": 0.9407868445694336,
-    "topic_loss": 0.1507136240392336,
-    "topic_accuracy": 0.9498742677227413,
-    "total_loss": 3.279922429068798,
+    "summarization_loss": 4.561023824777751,
+    "summarization_rouge_like": 0.20945581532076613,
+    "emotion_loss": 0.14958151845580364,
+    "emotion_f1": 0.008022325540815077,
+    "topic_loss": 0.8585619787599033,
+    "topic_accuracy": 0.7299605100316837,
+    "total_loss": 5.569167470253677,
     "epoch": 2.0
   },
   "val_epoch_2": {
-    "summarization_loss": 2.8898715693603942,
-    "summarization_rouge_like": 0.4654528613816311,
-    "emotion_loss": 0.05001389549380918,
-    "emotion_f1": 0.9344953305524384,
-    "topic_loss": 0.1755385091801308,
-    "topic_accuracy": 0.9435966487133395,
+    "summarization_loss": 4.335443331423179,
+    "summarization_rouge_like": 0.2383154143354784,
+    "emotion_loss": 0.1478777239331147,
+    "emotion_f1": 0.010150822387259202,
+    "topic_loss": 0.841049696600522,
+    "topic_accuracy": 0.7359938993390932,
     "epoch": 2.0
   },
   "train_epoch_3": {
-    "summarization_loss": 3.0340622767404044,
-    "summarization_rouge_like": 0.4502876682264882,
-    "emotion_loss": 0.025708710505635942,
-    "emotion_f1": 0.9647584015837614,
-    "topic_loss": 0.11707986947991166,
-    "topic_accuracy": 0.9614479064357344,
-    "total_loss": 3.176850952874497,
+    "summarization_loss": 4.332563984521343,
+    "summarization_rouge_like": 0.24358268281949097,
+    "emotion_loss": 0.14520242059475916,
+    "emotion_f1": 0.026584760984350638,
+    "topic_loss": 0.8084657974773926,
+    "topic_accuracy": 0.7434995609372882,
+    "total_loss": 5.286232347914138,
     "epoch": 3.0
   },
   "val_epoch_3": {
-    "summarization_loss": 2.865455434181104,
-    "summarization_rouge_like": 0.46790124713702563,
-    "emotion_loss": 0.05574661032417156,
-    "emotion_f1": 0.940105742034193,
-    "topic_loss": 0.19245651335709887,
-    "topic_accuracy": 0.942998204667858,
+    "summarization_loss": 4.0994785383502785,
+    "summarization_rouge_like": 0.2839536633314319,
+    "emotion_loss": 0.14214695994858215,
+    "emotion_f1": 0.028164719230763854,
+    "topic_loss": 0.8218616072552484,
+    "topic_accuracy": 0.7413319776309091,
     "epoch": 3.0
   }
 }
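
The 74% topic accuracy and ROUGE 0.284 quoted in the commit message are the val_epoch_3 figures above (topic_accuracy 0.7413, summarization_rouge_like 0.2840). A small sketch, not part of this commit, for pulling the per-epoch validation metrics back out of this file:

# Sketch, not part of this commit: print validation metrics per epoch.
import json
from pathlib import Path

history = json.loads(Path("outputs/training_history.json").read_text(encoding="utf-8"))
for key, metrics in history.items():
    if key.startswith("val_epoch"):
        print(
            f"{key}: topic_acc={metrics['topic_accuracy']:.3f} "
            f"rouge_like={metrics['summarization_rouge_like']:.3f} "
            f"emotion_f1={metrics['emotion_f1']:.3f}"
        )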
scripts/download_data.py CHANGED
@@ -1,8 +1,11 @@
 """
 Dataset download script for LexiMind.
 
-Downloads training datasets from various sources including HuggingFace Hub,
-Kaggle, and Project Gutenberg. Handles automatic conversion to JSONL format.
+Downloads training datasets from HuggingFace Hub and Project Gutenberg:
+- GoEmotions: 28 emotion labels (43K samples)
+- Yahoo Answers: 10 topic labels (1.4M samples, subset to 200K)
+- CNN/DailyMail + BookSum: Summarization (100K + 9.6K samples)
+- Gutenberg: Classic books for inference demos
 
 Author: Oliver Perrin
 Date: December 2025
@@ -12,15 +15,17 @@ from __future__ import annotations
 
 import argparse
 import json
+import random
 import socket
 import sys
 from pathlib import Path
-from subprocess import CalledProcessError, run
-from typing import Iterable, Iterator, cast
+from typing import Any, cast
 from urllib.error import URLError
 from urllib.request import urlopen
 
-from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
+from datasets import ClassLabel, DatasetDict, load_dataset
+from datasets import Sequence as DatasetSequence
+from tqdm import tqdm
 
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 if str(PROJECT_ROOT) not in sys.path:
@@ -29,228 +34,338 @@ if str(PROJECT_ROOT) not in sys.path:
 from src.utils.config import load_yaml
 
 DOWNLOAD_TIMEOUT = 60
-DEFAULT_SUMMARIZATION_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
-DEFAULT_EMOTION_DATASET = "dair-ai/emotion"
-DEFAULT_TOPIC_DATASET = "ag_news"
-DEFAULT_BOOK_URL = "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"
-DEFAULT_BOOK_OUTPUT = "data/raw/books/pride_and_prejudice.txt"
 
-
-def kaggle_download(dataset: str, output_dir: str) -> None:
-    target = Path(output_dir)
-    target.mkdir(parents=True, exist_ok=True)
-    try:
-        run(
-            [
-                "kaggle",
-                "datasets",
-                "download",
-                "-d",
-                dataset,
-                "-p",
-                str(target),
-                "--unzip",
-            ],
-            check=True,
-        )
-    except CalledProcessError as error:
-        raise RuntimeError(
-            "Kaggle download failed. Verify that the Kaggle CLI is authenticated,"
-            " you have accepted the dataset terms on kaggle.com, and your kaggle.json"
-            " credentials are located in %USERPROFILE%/.kaggle."
-        ) from error
+# --------------- Label Definitions ---------------
+
+EMOTION_LABELS = [
+    "admiration",
+    "amusement",
+    "anger",
+    "annoyance",
+    "approval",
+    "caring",
+    "confusion",
+    "curiosity",
+    "desire",
+    "disappointment",
+    "disapproval",
+    "disgust",
+    "embarrassment",
+    "excitement",
+    "fear",
+    "gratitude",
+    "grief",
+    "joy",
+    "love",
+    "nervousness",
+    "optimism",
+    "pride",
+    "realization",
+    "relief",
+    "remorse",
+    "sadness",
+    "surprise",
+    "neutral",
+]
+
+TOPIC_LABELS = [
+    "Society & Culture",
+    "Science & Mathematics",
+    "Health",
+    "Education & Reference",
+    "Computers & Internet",
+    "Sports",
+    "Business & Finance",
+    "Entertainment & Music",
+    "Family & Relationships",
+    "Politics & Government",
+]
+
+
+# --------------- Utility Functions ---------------
+
+
+def _write_jsonl(records: list[dict], destination: Path, desc: str = "Writing") -> None:
+    """Write records to JSONL file with progress bar."""
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with destination.open("w", encoding="utf-8") as f:
+        for record in tqdm(records, desc=desc, leave=False):
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
 
 
 def gutenberg_download(url: str, output_path: str) -> None:
+    """Download a text file from Project Gutenberg."""
     target = Path(output_path)
     target.parent.mkdir(parents=True, exist_ok=True)
     try:
-        with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response, target.open("wb") as handle:
-            chunk = response.read(8192)
-            while chunk:
-                handle.write(chunk)
-                chunk = response.read(8192)
-    except (URLError, socket.timeout, OSError) as error:
-        raise RuntimeError(f"Failed to download '{url}' to '{target}': {error}") from error
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Download datasets required for LexiMind training")
-    parser.add_argument(
-        "--config",
-        default="configs/data/datasets.yaml",
-        help="Path to the dataset configuration YAML.",
-    )
-    parser.add_argument(
-        "--skip-kaggle",
-        action="store_true",
-        help="Skip downloading the Kaggle summarization dataset.",
-    )
-    parser.add_argument(
-        "--skip-book", action="store_true", help="Skip downloading Gutenberg book texts."
-    )
-    return parser.parse_args()
-
-
-def _safe_load_config(path: str | None) -> dict:
-    if not path:
-        return {}
-    config_path = Path(path)
-    if not config_path.exists():
-        raise FileNotFoundError(f"Config file not found: {config_path}")
-    return load_yaml(str(config_path)).data
-
-
-def _write_jsonl(records: Iterable[dict[str, object]], destination: Path) -> None:
-    destination.parent.mkdir(parents=True, exist_ok=True)
-    with destination.open("w", encoding="utf-8") as handle:
-        for record in records:
-            handle.write(json.dumps(record, ensure_ascii=False) + "\n")
-
-
-def _emotion_records(
-    dataset_split: Dataset, label_names: list[str] | None
-) -> Iterator[dict[str, object]]:
-    for item in dataset_split:
-        data = dict(item)
-        text = data.get("text", "")
-        label_value = data.get("label")
-
-        def resolve_label(index: object) -> str:
-            if isinstance(index, int) and label_names and 0 <= index < len(label_names):
-                return label_names[index]
-            return str(index)
-
-        if isinstance(label_value, list):
-            labels = [resolve_label(idx) for idx in label_value]
-        else:
-            labels = [resolve_label(label_value)]
-        yield {"text": text, "emotions": labels}
-
-
-def _topic_records(
-    dataset_split: Dataset, label_names: list[str] | None
-) -> Iterator[dict[str, object]]:
-    for item in dataset_split:
-        data = dict(item)
-        text = data.get("text") or data.get("content") or ""
-        label_value = data.get("label")
-
-        def resolve_topic(raw: object) -> str:
-            if label_names:
-                idx: int | None = None
-                if isinstance(raw, int):
-                    idx = raw
-                elif isinstance(raw, str):
-                    try:
-                        idx = int(raw)
-                    except ValueError:
-                        idx = None
-                if idx is not None and 0 <= idx < len(label_names):
-                    return label_names[idx]
-            return str(raw) if raw is not None else ""
-
-        if isinstance(label_value, list):
-            topic = resolve_topic(label_value[0]) if label_value else ""
-        else:
-            topic = resolve_topic(label_value)
-        yield {"text": text, "topic": topic}
-
-
-def main() -> None:
-    args = parse_args()
-    config = _safe_load_config(args.config)
-
-    raw_paths = config.get("raw", {}) if isinstance(config, dict) else {}
-    downloads_cfg = config.get("downloads", {}) if isinstance(config, dict) else {}
-
-    summarization_cfg = (
-        downloads_cfg.get("summarization", {}) if isinstance(downloads_cfg, dict) else {}
-    )
-    summarization_dataset = summarization_cfg.get("dataset", DEFAULT_SUMMARIZATION_DATASET)
-    summarization_output = summarization_cfg.get(
-        "output", raw_paths.get("summarization", "data/raw/summarization")
-    )
-
-    if not args.skip_kaggle and summarization_dataset:
-        print(
-            f"Downloading summarization dataset '{summarization_dataset}' -> {summarization_output}"
-        )
-        kaggle_download(summarization_dataset, summarization_output)
-    else:
-        print("Skipping Kaggle summarization download.")
-
-    books_root = Path(raw_paths.get("books", "data/raw/books"))
-    books_root.mkdir(parents=True, exist_ok=True)
-
-    books_entries: list[dict[str, object]] = []
-    if isinstance(downloads_cfg, dict):
-        raw_entries = downloads_cfg.get("books")
-        if isinstance(raw_entries, list):
-            books_entries = [entry for entry in raw_entries if isinstance(entry, dict)]
-
-    if not args.skip_book:
-        if not books_entries:
-            books_entries = [
-                {
-                    "name": "pride_and_prejudice",
-                    "url": DEFAULT_BOOK_URL,
-                    "output": DEFAULT_BOOK_OUTPUT,
-                }
-            ]
-        for entry in books_entries:
-            name = str(entry.get("name") or "gutenberg_text")
-            url = str(entry.get("url") or DEFAULT_BOOK_URL)
-            output_value = entry.get("output")
-            destination = (
-                Path(output_value)
-                if isinstance(output_value, str) and output_value
-                else books_root / f"{name}.txt"
-            )
-            destination.parent.mkdir(parents=True, exist_ok=True)
-            print(f"Downloading Gutenberg text '{name}' from {url} -> {destination}")
-            gutenberg_download(url, str(destination))
-    else:
-        print("Skipping Gutenberg downloads.")
-    emotion_cfg = downloads_cfg.get("emotion", {}) if isinstance(downloads_cfg, dict) else {}
-    emotion_name = emotion_cfg.get("dataset", DEFAULT_EMOTION_DATASET)
-    emotion_dir = Path(raw_paths.get("emotion", "data/raw/emotion"))
-    emotion_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Downloading emotion dataset '{emotion_name}' -> {emotion_dir}")
-    emotion_dataset = cast(DatasetDict, load_dataset(emotion_name))
-    first_emotion_key = next(iter(emotion_dataset.keys()), None) if emotion_dataset else None
-    emotion_label_feature = (
-        emotion_dataset[first_emotion_key].features.get("label")
-        if first_emotion_key is not None
-        else None
-    )
-    emotion_label_names = (
-        emotion_label_feature.names if isinstance(emotion_label_feature, ClassLabel) else None
-    )
-    for split_name, split in emotion_dataset.items():
-        output_path = emotion_dir / f"{str(split_name)}.jsonl"
-        _write_jsonl(_emotion_records(split, emotion_label_names), output_path)
-
-    topic_cfg = downloads_cfg.get("topic", {}) if isinstance(downloads_cfg, dict) else {}
-    topic_name = topic_cfg.get("dataset", DEFAULT_TOPIC_DATASET)
-    topic_dir = Path(raw_paths.get("topic", "data/raw/topic"))
-    topic_dir.mkdir(parents=True, exist_ok=True)
-    print(f"Downloading topic dataset '{topic_name}' -> {topic_dir}")
-    topic_dataset = cast(DatasetDict, load_dataset(topic_name))
-    first_topic_key = next(iter(topic_dataset.keys()), None) if topic_dataset else None
-    topic_label_feature = (
-        topic_dataset[first_topic_key].features.get("label")
-        if first_topic_key is not None
-        else None
-    )
-    topic_label_names = (
-        topic_label_feature.names if isinstance(topic_label_feature, ClassLabel) else None
-    )
-    for split_name, split in topic_dataset.items():
-        output_path = topic_dir / f"{str(split_name)}.jsonl"
-        _write_jsonl(_topic_records(split, topic_label_names), output_path)
-
-    print("Download routine finished.")
+        with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response:
+            content = response.read()
+        target.write_bytes(content)
+    except (URLError, socket.timeout, OSError) as e:
+        raise RuntimeError(f"Failed to download '{url}': {e}") from e
+
+
+# --------------- Emotion Dataset (GoEmotions) ---------------
+
+
+def download_emotion_dataset(output_dir: Path, config: dict) -> None:
+    """Download GoEmotions dataset with 28 emotion labels."""
+    print("\n📥 Downloading GoEmotions (28 emotions)...")
+
+    dataset_name = config.get("dataset", "google-research-datasets/go_emotions")
+    dataset_config = config.get("config", "simplified")
+
+    ds = cast(DatasetDict, load_dataset(dataset_name, dataset_config))
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get label names from dataset
+    label_feature = ds["train"].features.get("labels")
+    inner_feature = getattr(label_feature, "feature", None)
+    if isinstance(label_feature, DatasetSequence) and isinstance(inner_feature, ClassLabel):
+        label_names = cast(list[str], inner_feature.names)
+    else:
+        label_names = EMOTION_LABELS
+
+    for split_name, split in ds.items():
+        records = []
+        for item in tqdm(split, desc=f"Processing {split_name}", leave=False):
+            row = cast(dict[str, Any], item)
+            text = row.get("text", "")
+            label_indices = row.get("labels", [])
+            # Convert indices to label names
+            emotions = [label_names[i] for i in label_indices if 0 <= i < len(label_names)]
+            if text and emotions:
+                records.append({"text": text, "emotions": emotions})
+
+        output_path = output_dir / f"{split_name}.jsonl"
+        _write_jsonl(records, output_path, f"Writing {split_name}")
+        print(f"  ✓ {split_name}: {len(records):,} samples -> {output_path}")
+
+    # Save label names
+    labels_path = output_dir / "labels.json"
+    labels_path.write_text(json.dumps(label_names, indent=2))
+    print(f"  ✓ Labels ({len(label_names)}): {labels_path}")
+
+
+# --------------- Topic Dataset (Yahoo Answers) ---------------
+
+
+def download_topic_dataset(output_dir: Path, config: dict) -> None:
+    """Download Yahoo Answers dataset with 10 topic labels."""
+    print("\n📥 Downloading Yahoo Answers (10 topics)...")
+
+    dataset_name = config.get("dataset", "yahoo_answers_topics")
+    max_samples = config.get("max_samples", 200000)
+
+    ds = cast(DatasetDict, load_dataset(dataset_name))
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get label names
+    label_feature = ds["train"].features.get("topic")
+    if isinstance(label_feature, ClassLabel):
+        label_names = label_feature.names
+    else:
+        label_names = TOPIC_LABELS
+
+    for split_name, split in ds.items():
+        # Determine sample limit for this split
+        if split_name == "train":
+            limit = max_samples
+        else:
+            limit = min(len(split), max_samples // 10)
+
+        # Random sample if needed
+        indices = list(range(len(split)))
+        if len(indices) > limit:
+            random.seed(42)
+            indices = random.sample(indices, limit)
+
+        records = []
+        for idx in tqdm(indices, desc=f"Processing {split_name}", leave=False):
+            item = cast(dict[str, Any], split[idx])
+            # Combine question and best answer for richer text
+            question = item.get("question_title", "") + " " + item.get("question_content", "")
+            answer = item.get("best_answer", "")
+            text = (question + " " + answer).strip()
+
+            topic_idx = item.get("topic", 0)
+            topic = label_names[topic_idx] if 0 <= topic_idx < len(label_names) else str(topic_idx)
+
+            if text and len(text) > 50:  # Filter very short texts
+                records.append({"text": text, "topic": topic})
+
+        output_path = output_dir / f"{split_name}.jsonl"
+        _write_jsonl(records, output_path, f"Writing {split_name}")
+        print(f"  ✓ {split_name}: {len(records):,} samples -> {output_path}")
+
+    # Save label names
+    labels_path = output_dir / "labels.json"
+    labels_path.write_text(json.dumps(label_names, indent=2))
+    print(f"  ✓ Labels ({len(label_names)}): {labels_path}")
+
+
+# --------------- Summarization Dataset (CNN/DailyMail + BookSum) ---------------
+
+
+def download_summarization_datasets(output_dir: Path, config: list[dict]) -> None:
+    """Download summarization datasets (CNN/DailyMail and BookSum)."""
+    print("\n📥 Downloading Summarization datasets...")
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    all_train, all_val, all_test = [], [], []
+
+    for ds_config in config:
+        name = ds_config.get("name", "unknown")
+        dataset_name = ds_config.get("dataset")
+        dataset_config = ds_config.get("config")
+        source_field = ds_config.get("source_field", "article")
+        target_field = ds_config.get("target_field", "highlights")
+        max_samples = ds_config.get("max_samples")
+
+        print(f"\n  Loading {name}...")
+
+        if not dataset_name:
+            print(f"  ✗ Skipping {name}: no dataset specified")
+            continue
+
+        if dataset_config:
+            ds = cast(DatasetDict, load_dataset(str(dataset_name), str(dataset_config)))
+        else:
+            ds = cast(DatasetDict, load_dataset(str(dataset_name)))
+
+        for split_name, split in ds.items():
+            split_str = str(split_name)
+            # Determine limit
+            limit = max_samples if max_samples else len(split)
+            if split_str != "train":
+                limit = min(len(split), limit // 10)
+
+            indices = list(range(min(len(split), limit)))
+
+            records = []
+            for idx in tqdm(indices, desc=f"{name}/{split_str}", leave=False):
+                item = cast(dict[str, Any], split[idx])
+                source = item.get(source_field, "")
+                target = item.get(target_field, "")
+
+                if source and target and len(str(source)) > 100:
+                    records.append({"source": source, "summary": target})
+
+            # Route to appropriate split
+            if "train" in split_str:
+                all_train.extend(records)
+            elif "val" in split_str or "validation" in split_str:
+                all_val.extend(records)
+            else:
+                all_test.extend(records)
+
+            print(f"  ✓ {split_name}: {len(records):,} samples")
+
+    # Write combined files
+    if all_train:
+        _write_jsonl(all_train, output_dir / "train.jsonl", "Writing train")
+        print(f"  ✓ Combined train: {len(all_train):,} samples")
+    if all_val:
+        _write_jsonl(all_val, output_dir / "validation.jsonl", "Writing validation")
+        print(f"  ✓ Combined validation: {len(all_val):,} samples")
+    if all_test:
+        _write_jsonl(all_test, output_dir / "test.jsonl", "Writing test")
+        print(f"  ✓ Combined test: {len(all_test):,} samples")
+
+
+# --------------- Book Downloads (Gutenberg) ---------------
+
+
+def download_books(books_dir: Path, config: list[dict]) -> None:
+    """Download classic books from Project Gutenberg."""
+    print("\n📥 Downloading Gutenberg books...")
+
+    books_dir.mkdir(parents=True, exist_ok=True)
+
+    for book in config:
+        name = book.get("name", "unknown")
+        url = book.get("url")
+        output = book.get("output", str(books_dir / f"{name}.txt"))
+
+        if not url:
+            continue
+
+        output_path = Path(output)
+        if output_path.exists():
+            print(f"  ✓ {name}: already exists")
+            continue
+
+        try:
+            print(f"  ⏳ {name}: downloading...")
+            gutenberg_download(url, str(output_path))
+            print(f"  ✓ {name}: {output_path}")
+        except Exception as e:
+            print(f"  ✗ {name}: {e}")
+
+
+# --------------- Main Entry Point ---------------
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Download LexiMind training datasets")
+    parser.add_argument(
+        "--config", default="configs/data/datasets.yaml", help="Dataset config path"
+    )
+    parser.add_argument(
+        "--skip-summarization", action="store_true", help="Skip summarization datasets"
+    )
+    parser.add_argument("--skip-emotion", action="store_true", help="Skip emotion dataset")
+    parser.add_argument("--skip-topic", action="store_true", help="Skip topic dataset")
+    parser.add_argument("--skip-books", action="store_true", help="Skip Gutenberg books")
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    # Load config
+    config_path = Path(args.config)
+    if not config_path.exists():
+        print(f"Config not found: {config_path}")
+        sys.exit(1)
+
+    config = load_yaml(str(config_path)).data
+    raw_paths = config.get("raw", {})
+    downloads = config.get("downloads", {})
+
+    print("=" * 60)
+    print("LexiMind Dataset Download")
+    print("=" * 60)
+
+    # Download emotion dataset
+    if not args.skip_emotion:
+        emotion_config = downloads.get("emotion", {})
+        emotion_dir = Path(raw_paths.get("emotion", "data/raw/emotion"))
+        download_emotion_dataset(emotion_dir, emotion_config)
+
+    # Download topic dataset
+    if not args.skip_topic:
+        topic_config = downloads.get("topic", {})
+        topic_dir = Path(raw_paths.get("topic", "data/raw/topic"))
+        download_topic_dataset(topic_dir, topic_config)
+
+    # Download summarization datasets
+    if not args.skip_summarization:
+        summ_config = downloads.get("summarization", [])
+        if isinstance(summ_config, list):
+            summ_dir = Path(raw_paths.get("summarization", "data/raw/summarization"))
+            download_summarization_datasets(summ_dir, summ_config)
+
+    # Download books
+    if not args.skip_books:
+        books_config = downloads.get("books", [])
+        if isinstance(books_config, list):
+            books_dir = Path(raw_paths.get("books", "data/raw/books"))
+            download_books(books_dir, books_config)
+
+    print("\n" + "=" * 60)
+    print("✅ Download complete!")
+    print("=" * 60)
 
 
 if __name__ == "__main__":
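
Running python scripts/download_data.py (optionally with --skip-summarization, --skip-emotion, --skip-topic, or --skip-books) leaves each task directory with JSONL splits plus a labels.json. A minimal sketch, not part of this commit, for checking the class balance of the Yahoo Answers subset it writes; the path assumes the default raw layout from datasets.yaml:

# Sketch, not part of this commit: topic distribution of the downloaded subset.
import json
from collections import Counter
from pathlib import Path

counts: Counter[str] = Counter()
with Path("data/raw/topic/train.jsonl").open(encoding="utf-8") as handle:
    for line in handle:
        counts[json.loads(line)["topic"]] += 1

for topic, count in counts.most_common():
    print(f"{topic}: {count:,}")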
scripts/preprocess_data.py CHANGED
@@ -141,24 +141,42 @@ def preprocess_summarization(raw_dir: Path, processed_dir: Path) -> None:
         return
 
     for split in ("train", "validation", "test"):
-        source_path = _resolve_csv(raw_dir, f"{split}.csv")
-        if source_path is None:
+        # Check for JSONL first (from new download script), then CSV (legacy)
+        jsonl_path = raw_dir / f"{split}.jsonl"
+        csv_path = _resolve_csv(raw_dir, f"{split}.csv")
+
+        if jsonl_path.exists():
+            source_path = jsonl_path
+            is_jsonl = True
+        elif csv_path is not None:
+            source_path = csv_path
+            is_jsonl = False
+        else:
             print(f"Skipping summarization split '{split}' (file not found)")
             continue
 
         output_path = processed_dir / f"{split}.jsonl"
         output_path.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing summarization split '{split}' to {output_path}")
-        with (
-            source_path.open("r", encoding="utf-8", newline="") as source_handle,
-            output_path.open("w", encoding="utf-8") as sink,
-        ):
-            reader = csv.DictReader(source_handle)
-            for row in reader:
-                article = row.get("article") or row.get("Article") or ""
-                highlights = row.get("highlights") or row.get("summary") or ""
-                payload = {"source": article.strip(), "summary": highlights.strip()}
-                sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
+
+        with output_path.open("w", encoding="utf-8") as sink:
+            if is_jsonl:
+                # Process JSONL format (from new download script)
+                for row in _read_jsonl(source_path):
+                    source = str(row.get("source") or row.get("article") or "")
+                    summary = str(row.get("summary") or row.get("highlights") or "")
+                    if source and summary:
+                        payload = {"source": source.strip(), "summary": summary.strip()}
+                        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
+            else:
+                # Process CSV format (legacy)
+                with source_path.open("r", encoding="utf-8", newline="") as source_handle:
+                    reader = csv.DictReader(source_handle)
+                    for row in reader:
+                        article = str(row.get("article") or row.get("Article") or "")
+                        highlights = str(row.get("highlights") or row.get("summary") or "")
+                        payload = {"source": article.strip(), "summary": highlights.strip()}
+                        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
 
 
 def preprocess_emotion(raw_dir: Path, processed_dir: Path, cleaner: BasicTextCleaner) -> None:
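
The JSONL branch above relies on a _read_jsonl helper that is not shown in this hunk, so it is presumably defined elsewhere in scripts/preprocess_data.py. If it is not, something along these lines would fit; this is a sketch of the assumed helper, not the project's actual implementation:

# Sketch, not part of this commit: the _read_jsonl helper the JSONL branch expects.
import json
from collections.abc import Iterator
from pathlib import Path

def _read_jsonl(path: Path) -> Iterator[dict]:
    """Yield one parsed record per non-empty line of a JSONL file."""
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if line:
                yield json.loads(line)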