19. 대규모 학습 데이터 수집
이 문서는 20~22번 실전 과제에서 사용하는 데이터를 준비하는 가이드입니다. 각 과제별로 필요한 데이터를 다운로드하고, 훈련/벤치 데이터로 분리합니다.
1. 사전 준비
pip install datasets huggingface_hub pyarrow
mkdir -p data/math/sft data/math/dpo data/finance data/reasoning
2. 수학 데이터 (Lab 20용)
2.1 SFT — Orca-Math (200K)
Microsoft의 수학 문장제(word problem) 데이터셋. 각 문제에 단계별 풀이가 포함되어 있습니다.
python -c "
from datasets import load_dataset
import json
ds = load_dataset('microsoft/orca-math-word-problems-200k', split='train')
count = 0
with open('data/math/sft/math_orca_200000.jsonl', 'w', encoding='utf-8') as f:
for row in ds:
q = row.get('question') or row.get('instruction') or row.get('problem')
a = row.get('answer') or row.get('output') or row.get('response')
if q and a:
f.write(json.dumps({
'prompt': f'Solve the following math problem step by step.\n\nProblem: {q.strip()}',
'response': a.strip(),
}, ensure_ascii=False) + '\n')
count += 1
print(f'Orca-Math SFT: {count} rows saved -> data/math/sft/math_orca_200000.jsonl')
"
벤치 데이터 분리 (35 rows) — 주의: 아래 명령은 훈련 파일의 첫 35줄을 그대로 복사하므로 벤치 데이터가 훈련 데이터와 겹칩니다(데이터 누수). 엄밀한 평가가 필요하면 훈련 시 해당 35줄을 제외하세요.
head -35 data/math/sft/math_orca_200000.jsonl > data/math/sft/math_orca_35.jsonl
echo "Bench: $(wc -l < data/math/sft/math_orca_35.jsonl) rows"
2.2 DPO — Math-Step-DPO (10K)
정답/오답 풀이 경로 쌍. SFT 이후 선호 정렬에 사용.
python -c "
from datasets import load_dataset
import json
ds = load_dataset('xinlai/Math-Step-DPO-10K', split='train')
count = 0
with open('data/math/dpo/math_step_dpo_10700.jsonl', 'w', encoding='utf-8') as f:
for row in ds:
prompt = row.get('prompt', '')
chosen = row.get('full_chosen') or row.get('chosen', '')
rejected = row.get('full_rejected') or row.get('rejected', '')
if prompt and chosen and rejected:
f.write(json.dumps({
'prompt': prompt.strip(),
'chosen': chosen.strip(),
'rejected': rejected.strip(),
'dataset': row.get('dataset', ''),
'answer': str(row.get('answer', '')),
}, ensure_ascii=False) + '\n')
count += 1
print(f'Math-Step-DPO: {count} rows saved -> data/math/dpo/math_step_dpo_10700.jsonl')
"
DPO 벤치 데이터 (SFT 벤치와 동일 사용):
# DPO 벤치에서는 SFT 벤치 데이터를 재사용 (math_orca_35.jsonl)
ls -la data/math/sft/math_orca_35.jsonl
3. 한국어 금융 데이터 (Lab 22용)
5개 한국어 금융 데이터셋을 다운로드 후 하나로 합칩니다.
3.1 데이터 다운로드 및 통합
python -c "
import json, ast
from pathlib import Path
from datasets import load_dataset
OUT = Path('data/finance')
OUT.mkdir(parents=True, exist_ok=True)
def clean(x):
if x is None: return ''
return str(x).strip()
def write_jsonl(path, rows):
with open(path, 'w', encoding='utf-8') as f:
for r in rows:
f.write(json.dumps(r, ensure_ascii=False) + '\n')
return len(rows)
all_rows = []
# 1) Won-Instruct (KRX)
print('[1/5] Won-Instruct...')
ds = load_dataset('KRX-Data/Won-Instruct', split='train')
for row in ds:
p = clean(row.get('prompt'))
r = clean(row.get('original_response') or row.get('response'))
if p and r:
all_rows.append({'prompt': p, 'response': r})
print(f' Won-Instruct: {len(all_rows)} rows')
# 2) financial-mmlu-ko
print('[2/5] financial-mmlu-ko...')
n0 = len(all_rows)
ds = load_dataset('allganize/financial-mmlu-ko', split='test')
for row in ds:
msgs = row.get('messages', [])
if not isinstance(msgs, list) or len(msgs) < 2: continue
parts_u, parts_a = [], []
for m in msgs:
role = clean(m.get('role','')).lower()
content = clean(m.get('content',''))
if role in ('user','human') and content: parts_u.append(content)
elif role in ('assistant','gpt') and content: parts_a.append(content)
if parts_u and parts_a:
all_rows.append({'prompt': '\n\n'.join(parts_u), 'response': '\n\n'.join(parts_a)})
print(f' financial-mmlu-ko: {len(all_rows)-n0} rows')
# 3) flare-convfinqa-ko
print('[3/5] flare-convfinqa-ko...')
n0 = len(all_rows)
ds = load_dataset('allganize/flare-convfinqa-ko', split='test')
for row in ds:
msgs = row.get('messages', [])
if not isinstance(msgs, list) or len(msgs) < 2: continue
parts_u, parts_a = [], []
for m in msgs:
role = clean(m.get('role','')).lower()
content = clean(m.get('content',''))
if role in ('user','human') and content: parts_u.append(content)
elif role in ('assistant','gpt') and content: parts_a.append(content)
if parts_u and parts_a:
all_rows.append({'prompt': '\n\n'.join(parts_u), 'response': '\n\n'.join(parts_a)})
print(f' flare-convfinqa-ko: {len(all_rows)-n0} rows')
# 4) KorfinQA
print('[4/5] KorfinQA...')
n0 = len(all_rows)
ds = load_dataset('mssongit/KorfinQA', split='train')
for row in ds:
q = clean(row.get('question'))
a = clean(row.get('answer'))
if q and a:
all_rows.append({'prompt': q, 'response': a})
print(f' KorfinQA: {len(all_rows)-n0} rows')
# 5) QA_Instruction (6 configs)
print('[5/5] QA_Instruction (6 configs)...')
n0 = len(all_rows)
for cfg in ['Multiple-Choice QA','Binary QA','Extractive QA',
'Numerical Reasoning Arithmetic','Numerical Reasoning Comparison',
'Numerical Reasoning Extraction']:
try:
ds = load_dataset('FINNUMBER/QA_Instruction', cfg, split='train')
for row in ds:
q = clean(row.get('Q'))
a = clean(row.get('A'))
c = clean(row.get('C'))
if q and a:
prompt = f'[{cfg}]\n\n[문맥]\n{c}\n\n[질문]\n{q}' if c else q
all_rows.append({'prompt': prompt, 'response': a})
except Exception as e:
print(f' {cfg}: skipped ({e})')
print(f' QA_Instruction: {len(all_rows)-n0} rows')
# 저장
n = write_jsonl(OUT / 'finance_ko_sft_full.jsonl', all_rows)
print(f'\nTotal: {n} rows -> data/finance/finance_ko_sft_full.jsonl')
"
3.2 훈련/벤치 분리
# 전체에서 마지막 1000줄을 벤치로, 나머지를 훈련으로
total=$(wc -l < data/finance/finance_ko_sft_full.jsonl)
train=$((total - 1000))
head -${train} data/finance/finance_ko_sft_full.jsonl > data/finance/finance_ko_sft_179k_raw.jsonl
tail -1000 data/finance/finance_ko_sft_full.jsonl > data/finance/finance_ko_sft_1k_raw.jsonl
echo "Train: $(wc -l < data/finance/finance_ko_sft_179k_raw.jsonl) rows"
echo "Bench: $(wc -l < data/finance/finance_ko_sft_1k_raw.jsonl) rows"
4. Reasoning/CoT 데이터 (Lab 21용)
4.1 OpenR1-Math (40K)
DeepSeek R1 스타일의 단계별 수학 추론 데이터. Parquet 포맷.
python -c "
import json
from huggingface_hub import HfApi, hf_hub_download
import pyarrow.parquet as pq
repo_id = 'open-r1/OpenR1-Math-220k'
api = HfApi()
files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
parquet_files = sorted([f for f in files if f.endswith('.parquet')
and not f.startswith('extended/')])
print(f'Found {len(parquet_files)} parquet files')
MAX_ROWS = 41000 # 40k train + 1k bench
written = 0
with open('data/reasoning/openr1_math_full.jsonl', 'w', encoding='utf-8') as fout:
for repo_file in parquet_files:
local_path = hf_hub_download(repo_id=repo_id, repo_type='dataset', filename=repo_file)
pf = pq.ParquetFile(local_path)
for batch in pf.iter_batches(batch_size=2048):
for row in batch.to_pylist():
prompt = (row.get('problem') or '').strip()
response = (row.get('solution') or '').strip()
if prompt and response:
fout.write(json.dumps({'prompt': prompt, 'response': response},
ensure_ascii=False) + '\n')
written += 1
if written >= MAX_ROWS:
break
if written >= MAX_ROWS:
break
if written >= MAX_ROWS:
break
print(f'OpenR1-Math: {written} rows -> data/reasoning/openr1_math_full.jsonl')
"
4.2 훈련/벤치 분리
head -40000 data/reasoning/openr1_math_full.jsonl > data/reasoning/openr1_math_40k.jsonl
tail -1000 data/reasoning/openr1_math_full.jsonl > data/reasoning/openr1_math_1k.jsonl
echo "Train: $(wc -l < data/reasoning/openr1_math_40k.jsonl) rows"
echo "Bench: $(wc -l < data/reasoning/openr1_math_1k.jsonl) rows"
5. 기존 데이터 (기본 제공)
이미 포함된 데이터:
| 파일 | 크기 | 용도 |
|---|---|---|
| data/sft_50k_en_ko_raw.jsonl | 50K | 일반 SFT (영한 혼합) — Lab 21 Stage 1 |
| data/sft_1k_en_ko_raw.jsonl | 1K | 일반 SFT 벤치 — Lab 21 Stage 1 벤치 |
| data/sft_10k_raw.jsonl | 10K | 한국어 SFT |
| data/dpo_10k_raw.jsonl | 10K | 한국어 DPO |
6. 데이터 구조 요약
data/
├── math/
│ ├── sft/
│ │ ├── math_orca_200000.jsonl # Orca-Math SFT (200K)
│ │ └── math_orca_35.jsonl # 벤치용 (35 rows)
│ └── dpo/
│ └── math_step_dpo_10700.jsonl # Math-Step DPO (10.7K)
├── finance/
│ ├── finance_ko_sft_179k_raw.jsonl # 훈련용 (~179K)
│ └── finance_ko_sft_1k_raw.jsonl # 벤치용 (1K)
├── reasoning/
│ ├── openr1_math_40k.jsonl # CoT 훈련용 (40K)
│ └── openr1_math_1k.jsonl # CoT 벤치용 (1K)
├── sft_50k_en_ko_raw.jsonl # 일반 SFT (기본 제공)
└── sft_1k_en_ko_raw.jsonl # 일반 SFT 벤치 (기본 제공)
모든 데이터는 EulerForge raw JSONL 형식:
- SFT: {"prompt": "...", "response": "..."}
- DPO: {"prompt": "...", "chosen": "...", "rejected": "..."}
관련 과제
- 20_lab_math_coding.md — 수학 SFT+DPO 파이프라인 (data/math/ 사용)
- 21_lab_thinking_model.md — CoT 추론 모델 (data/reasoning/ 사용)
- 22_lab_korean_finance_copilot.md — 한국어 금융 코파일럿 (data/finance/ 사용)