19. Data Collection for Labs
This document is a guide for preparing the data used in hands-on Labs 20-22. For each lab, download the required data and split it into training and bench datasets.
1. Prerequisites
pip install datasets huggingface_hub pyarrow
mkdir -p data/math/sft data/math/dpo data/finance data/reasoning
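Most of these datasets download anonymously, but if any load_dataset call below fails with an authorization error (for example, a gated repo), log in to Hugging Face first:
huggingface-cli login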
2. Math Data (for Lab 20)
2.1 SFT — Orca-Math (200K)
Microsoft's math word problem dataset. Includes step-by-step solutions.
python -c "
from datasets import load_dataset
import json
ds = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
count = 0
with open('data/math/sft/math_orca_200000.jsonl', 'w', encoding='utf-8') as f:
for row in ds:
q = row.get('question') or row.get('instruction') or row.get('problem')
a = row.get('answer') or row.get('output') or row.get('response')
if q and a:
f.write(json.dumps({
'prompt': f'Solve the following math problem step by step.\n\nProblem: {q.strip()}',
'response': a.strip(),
}, ensure_ascii=False) + '\n')
count += 1
print(f'Orca-Math SFT: {count} rows saved -> data/math/sft/math_orca_200000.jsonl')
"
Create the bench set (first 35 rows):
head -35 data/math/sft/math_orca_200000.jsonl > data/math/sft/math_orca_35.jsonl
echo "Bench: $(wc -l < data/math/sft/math_orca_35.jsonl) rows"
2.2 DPO — Math-Step-DPO (10K)
Correct/incorrect solution path pairs. Used for preference alignment after SFT.
python -c "
from datasets import load_dataset
import json
ds = load_dataset('xinlai/Math-Step-DPO-10K', split='train')
count = 0
with open('data/math/dpo/math_step_dpo_10700.jsonl', 'w', encoding='utf-8') as f:
for row in ds:
prompt = row.get('prompt', '')
chosen = row.get('full_chosen') or row.get('chosen', '')
rejected = row.get('full_rejected') or row.get('rejected', '')
if prompt and chosen and rejected:
f.write(json.dumps({
'prompt': prompt.strip(),
'chosen': chosen.strip(),
'rejected': rejected.strip(),
'dataset': row.get('dataset', ''),
'answer': str(row.get('answer', '')),
}, ensure_ascii=False) + '\n')
count += 1
print(f'Math-Step-DPO: {count} rows saved -> data/math/dpo/math_step_dpo_10700.jsonl')
"
DPO bench data (reuses SFT bench data):
# DPO bench reuses the SFT bench data (math_orca_35.jsonl)
ls -la data/math/sft/math_orca_35.jsonl
3. Korean Finance Data (for Lab 22)
Download 5 Korean finance datasets and merge them into one.
3.1 Download and Merge
python -c "
import json, ast
from pathlib import Path
from datasets import load_dataset
OUT = Path('data/finance')
OUT.mkdir(parents=True, exist_ok=True)
def clean(x):
if x is None: return ''
return str(x).strip()
def write_jsonl(path, rows):
with open(path, 'w', encoding='utf-8') as f:
for r in rows:
f.write(json.dumps(r, ensure_ascii=False) + '\n')
return len(rows)
all_rows = []
# 1) Won-Instruct (KRX)
print('[1/5] Won-Instruct...')
ds = load_dataset('KRX-Data/Won-Instruct', split='train')
for row in ds:
p = clean(row.get('prompt'))
r = clean(row.get('original_response') or row.get('response'))
if p and r:
all_rows.append({'prompt': p, 'response': r})
print(f' Won-Instruct: {len(all_rows)} rows')
# 2) financial-mmlu-ko
print('[2/5] financial-mmlu-ko...')
n0 = len(all_rows)
ds = load_dataset('allganize/financial-mmlu-ko', split='test')
for row in ds:
msgs = row.get('messages', [])
if not isinstance(msgs, list) or len(msgs) < 2: continue
parts_u, parts_a = [], []
for m in msgs:
role = clean(m.get('role','')).lower()
content = clean(m.get('content',''))
if role in ('user','human') and content: parts_u.append(content)
elif role in ('assistant','gpt') and content: parts_a.append(content)
if parts_u and parts_a:
all_rows.append({'prompt': '\n\n'.join(parts_u), 'response': '\n\n'.join(parts_a)})
print(f' financial-mmlu-ko: {len(all_rows)-n0} rows')
# 3) flare-convfinqa-ko
print('[3/5] flare-convfinqa-ko...')
n0 = len(all_rows)
ds = load_dataset('allganize/flare-convfinqa-ko', split='test')
for row in ds:
msgs = row.get('messages', [])
if not isinstance(msgs, list) or len(msgs) < 2: continue
parts_u, parts_a = [], []
for m in msgs:
role = clean(m.get('role','')).lower()
content = clean(m.get('content',''))
if role in ('user','human') and content: parts_u.append(content)
elif role in ('assistant','gpt') and content: parts_a.append(content)
if parts_u and parts_a:
all_rows.append({'prompt': '\n\n'.join(parts_u), 'response': '\n\n'.join(parts_a)})
print(f' flare-convfinqa-ko: {len(all_rows)-n0} rows')
# 4) KorfinQA
print('[4/5] KorfinQA...')
n0 = len(all_rows)
ds = load_dataset('mssongit/KorfinQA', split='train')
for row in ds:
q = clean(row.get('question'))
a = clean(row.get('answer'))
if q and a:
all_rows.append({'prompt': q, 'response': a})
print(f' KorfinQA: {len(all_rows)-n0} rows')
# 5) QA_Instruction (6 configs)
print('[5/5] QA_Instruction (6 configs)...')
n0 = len(all_rows)
for cfg in ['Multiple-Choice QA','Binary QA','Extractive QA',
'Numerical Reasoning Arithmetic','Numerical Reasoning Comparison',
'Numerical Reasoning Extraction']:
try:
ds = load_dataset('FINNUMBER/QA_Instruction', cfg, split='train')
for row in ds:
q = clean(row.get('Q'))
a = clean(row.get('A'))
c = clean(row.get('C'))
if q and a:
prompt = f'[{cfg}]\n\n[Context]\n{c}\n\n[Question]\n{q}' if c else q
all_rows.append({'prompt': prompt, 'response': a})
except Exception as e:
print(f' {cfg}: skipped ({e})')
print(f' QA_Instruction: {len(all_rows)-n0} rows')
# Save
n = write_jsonl(OUT / 'finance_ko_sft_full.jsonl', all_rows)
print(f'\nTotal: {n} rows -> data/finance/finance_ko_sft_full.jsonl')
"
3.2 Train/Bench Split
# Use the last 1000 lines as bench, the rest as training
total=$(wc -l < data/finance/finance_ko_sft_full.jsonl)
train=$((total - 1000))
head -${train} data/finance/finance_ko_sft_full.jsonl > data/finance/finance_ko_sft_179k_raw.jsonl
tail -1000 data/finance/finance_ko_sft_full.jsonl > data/finance/finance_ko_sft_1k_raw.jsonl
echo "Train: $(wc -l < data/finance/finance_ko_sft_179k_raw.jsonl) rows"
echo "Bench: $(wc -l < data/finance/finance_ko_sft_1k_raw.jsonl) rows"
4. Reasoning/CoT Data (for Lab 21)
4.1 OpenR1-Math (40K)
DeepSeek R1-style step-by-step math reasoning data. Parquet format.
python -c "
import json
from huggingface_hub import HfApi, hf_hub_download
import pyarrow.parquet as pq
repo_id = 'open-r1/OpenR1-Math-220k'
api = HfApi()
files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
parquet_files = sorted([f for f in files if f.endswith('.parquet')
and not f.startswith('extended/')])
print(f'Found {len(parquet_files)} parquet files')
MAX_ROWS = 41000 # 40k train + 1k bench
written = 0
with open('data/reasoning/openr1_math_full.jsonl', 'w', encoding='utf-8') as fout:
for repo_file in parquet_files:
local_path = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=repo_file)
pf = pq.ParquetFile(local_path)
for batch in pf.iter_batches(batch_size=2048):
for row in batch.to_pylist():
prompt = (row.get('problem') or '').strip()
response = (row.get('solution') or '').strip()
if prompt and response:
fout.write(json.dumps({'prompt': prompt, 'response': response},
ensure_ascii=False) + '\n')
written += 1
if written >= MAX_ROWS:
break
if written >= MAX_ROWS:
break
if written >= MAX_ROWS:
break
print(f'OpenR1-Math: {written} rows -> data/reasoning/openr1_math_full.jsonl')
"
4.2 Train/Bench Split
head -40000 data/reasoning/openr1_math_full.jsonl > data/reasoning/openr1_math_40k.jsonl
tail -1000 data/reasoning/openr1_math_full.jsonl > data/reasoning/openr1_math_1k.jsonl
echo "Train: $(wc -l < data/reasoning/openr1_math_40k.jsonl) rows"
echo "Bench: $(wc -l < data/reasoning/openr1_math_1k.jsonl) rows"
5. Pre-existing Data (Included by Default)
The following data ships with the labs by default:

| File | Size | Purpose |
|---|---|---|
| data/sft_50k_en_ko_raw.jsonl | 50K | General SFT (English-Korean mix) -- Lab 21 Stage 1 |
| data/sft_1k_en_ko_raw.jsonl | 1K | General SFT bench -- Lab 21 Stage 1 bench |
| data/sft_10k_raw.jsonl | 10K | Korean SFT |
| data/dpo_10k_raw.jsonl | 10K | Korean DPO |
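As a quick check that these files are in place (paths are assumed relative to the repo root):
wc -l data/sft_50k_en_ko_raw.jsonl data/sft_1k_en_ko_raw.jsonl data/sft_10k_raw.jsonl data/dpo_10k_raw.jsonl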
6. Data Structure Summary
data/
├── math/
│ ├── sft/
│ │ ├── math_orca_200000.jsonl # Orca-Math SFT (200K)
│ │ └── math_orca_35.jsonl # For bench (35 rows)
│ └── dpo/
│ └── math_step_dpo_10700.jsonl # Math-Step DPO (10.7K)
├── finance/
│ ├── finance_ko_sft_179k_raw.jsonl # For training (~179K)
│ └── finance_ko_sft_1k_raw.jsonl # For bench (1K)
├── reasoning/
│ ├── openr1_math_40k.jsonl # CoT training (40K)
│ └── openr1_math_1k.jsonl # CoT bench (1K)
├── sft_50k_en_ko_raw.jsonl            # General SFT (included by default)
├── sft_1k_en_ko_raw.jsonl             # General SFT bench (included by default)
├── sft_10k_raw.jsonl                  # Korean SFT (included by default)
└── dpo_10k_raw.jsonl                  # Korean DPO (included by default)
All data is in EulerForge raw JSONL format:
- SFT: {"prompt": "...", "response": "..."}
- DPO: {"prompt": "...", "chosen": "...", "rejected": "..."}
7. Related Labs
- 20_lab_math_coding.md -- Math SFT+DPO pipeline (uses data/math/)
- 21_lab_thinking_model.md -- CoT reasoning model (uses data/reasoning/)
- 22_lab_korean_finance_copilot.md -- Korean finance copilot (uses data/finance/)