LLM Evaluation: Metrics and Methods
1. Technical Analysis
1.1 LLM Evaluation Dimensions
Evaluating an LLM requires considering multiple dimensions:
- Capability: language understanding, reasoning, generation
- Safety: alignment, bias, harmful content
- Efficiency: speed, memory, cost
- Reliability: consistency, stability
1.2 Evaluation Metrics
| Dimension | Metric | Method |
|---|---|---|
| Language ability | Perplexity | Held-out likelihood (perplexity) |
| Knowledge & understanding | MMLU | Multi-task multiple-choice QA |
| Reasoning | GSM8K | Grade-school math word problems |
| Generation quality | BLEU/ROUGE | N-gram overlap with references |
| Alignment | MT-Bench | Multi-turn dialogue, judged responses |
1.3 Evaluation Methods
- Automatic evaluation: metric computation
- Human evaluation: human ratings
- Comparative evaluation: head-to-head model comparison (see the sketch below)
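Sections 2.1 and 2.3 implement the first two methods. Comparative evaluation is not implemented below, so here is a minimal sketch: it assumes per-sample judge verdicts of `'A'`, `'B'`, or `'tie'` (an illustrative convention, not a standard format), with ties counted as half a win.

```python
def pairwise_win_rate(verdicts):
    """Win rate of model A given per-sample verdicts: 'A', 'B', or 'tie'."""
    wins = sum(v == 'A' for v in verdicts)
    ties = sum(v == 'tie' for v in verdicts)
    return (wins + 0.5 * ties) / len(verdicts)

# 6 wins, 2 ties, 2 losses over 10 prompts -> 0.70
print(pairwise_win_rate(['A'] * 6 + ['tie'] * 2 + ['B'] * 2))
```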
2. Core Implementation
2.1 Automatic Evaluation Metrics
```python
import math

import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer


class PerplexityCalculator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def calculate(self, text):
        encodings = self.tokenizer(text, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(
                input_ids=encodings.input_ids,
                labels=encodings.input_ids,
            )
        # outputs.loss is the mean negative log-likelihood per token,
        # so perplexity is simply exp(loss).
        return math.exp(outputs.loss.item())


class BLEUScoreCalculator:
    def calculate(self, reference, candidate):
        reference_tokens = [reference.split()]
        candidate_tokens = candidate.split()
        # Smoothing avoids zero scores on short candidates that have no
        # higher-order n-gram overlap with the reference.
        smoothing = SmoothingFunction().method1
        return sentence_bleu(reference_tokens, candidate_tokens,
                             smoothing_function=smoothing)


class RougeScoreCalculator:
    def calculate(self, reference, candidate):
        scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, candidate)
        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure,
        }
```
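A minimal usage sketch, assuming a Hugging Face causal LM ('gpt2' is only a placeholder; any model whose forward pass accepts `labels` works):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

ppl = PerplexityCalculator(model, tokenizer).calculate(
    'The quick brown fox jumps over the lazy dog.')
bleu = BLEUScoreCalculator().calculate(
    'the cat sat on the mat', 'the cat is on the mat')
print(f'perplexity={ppl:.2f}, BLEU={bleu:.3f}')
```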
2.2 Benchmark Tests
```python
class BenchmarkRunner:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def run_mmlu(self, dataset, k_shot=5):
        correct = 0
        total = 0
        for sample in dataset:
            prompt = self._build_mmlu_prompt(sample, k_shot)
            response = self._generate(prompt)
            # The model may emit trailing text; match on the leading option letter.
            if response.strip().startswith(sample['answer']):
                correct += 1
            total += 1
        return correct / total

    def _build_mmlu_prompt(self, sample, k_shot):
        prompt = "Answer the following questions:\n\n"
        for example in sample['examples'][:k_shot]:
            prompt += (f"Question: {example['question']}\n"
                       f"Options: {example['options']}\n"
                       f"Answer: {example['answer']}\n\n")
        prompt += (f"Question: {sample['question']}\n"
                   f"Options: {sample['options']}\nAnswer:")
        return prompt

    def run_gsm8k(self, dataset):
        correct = 0
        total = 0
        for sample in dataset:
            prompt = (f"Solve the following math problem:\n\n"
                      f"{sample['question']}\n\nSolution:")
            response = self._generate(prompt)
            if self._extract_answer(response) == sample['answer']:
                correct += 1
            total += 1
        return correct / total

    def _generate(self, prompt):
        inputs = self.tokenizer(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=100,
                do_sample=False,  # greedy decoding; temperature=0 is invalid in HF generate
            )
        # Strip the prompt tokens so only the completion is returned.
        new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True)

    def _extract_answer(self, text):
        # Simplistic: takes the last bare integer; misses negatives,
        # comma-separated thousands, and decimals.
        numbers = [int(s) for s in text.split() if s.isdigit()]
        return numbers[-1] if numbers else None
```
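A hypothetical smoke test against a one-sample toy dataset; the field names (`question`, `options`, `answer`, `examples`) match what this BenchmarkRunner expects, not the official MMLU release format:

```python
toy_mmlu = [{
    'question': 'What is 2 + 2?',
    'options': 'A. 3  B. 4  C. 5  D. 6',
    'answer': 'B',
    'examples': [{'question': 'What is 1 + 1?',
                  'options': 'A. 1  B. 2  C. 3  D. 4',
                  'answer': 'B'}],
}]

runner = BenchmarkRunner(model, tokenizer)
print(f'toy MMLU accuracy: {runner.run_mmlu(toy_mmlu, k_shot=1):.0%}')
```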
2.3 Human Evaluation Framework
```python
import language_tool_python


class HumanEvaluationFramework:
    """Scores outputs on human-style criteria using automatic proxies."""

    def __init__(self, criteria):
        self.criteria = criteria
        self._grammar_tool = None  # created lazily; needs a LanguageTool backend

    def evaluate(self, model_outputs, references):
        results = [self._score(output, reference)
                   for output, reference in zip(model_outputs, references)]
        return self._aggregate(results)

    def _score(self, output, reference):
        return {criterion: self._evaluate_criterion(output, reference, criterion)
                for criterion in self.criteria}

    def _evaluate_criterion(self, output, reference, criterion):
        if criterion == 'accuracy':
            return 1.0 if output == reference else 0.0
        if criterion == 'relevance':
            return self._calculate_relevance(output, reference)
        if criterion == 'fluency':
            return self._calculate_fluency(output)
        if criterion == 'completeness':
            return self._calculate_completeness(output, reference)
        raise ValueError(f'unknown criterion: {criterion}')

    def _calculate_relevance(self, output, reference):
        # Precision: what fraction of the output is supported by the reference.
        output_tokens = set(output.split())
        reference_tokens = set(reference.split())
        if not output_tokens:
            return 0.0
        return len(output_tokens & reference_tokens) / len(output_tokens)

    def _calculate_fluency(self, text):
        # Grammar errors per token, inverted into a [0, 1] score. Use a
        # language code matching the evaluated text ('en-US', 'zh-CN', ...).
        if self._grammar_tool is None:
            self._grammar_tool = language_tool_python.LanguageTool('en-US')
        matches = self._grammar_tool.check(text)
        error_rate = len(matches) / max(len(text.split()), 1)
        return max(0.0, 1.0 - error_rate)

    def _calculate_completeness(self, output, reference):
        # Recall: what fraction of the reference is covered by the output.
        output_tokens = set(output.split())
        reference_tokens = set(reference.split())
        if not reference_tokens:
            return 0.0
        return len(output_tokens & reference_tokens) / len(reference_tokens)

    def _aggregate(self, results):
        return {criterion: sum(r[criterion] for r in results) / len(results)
                for criterion in self.criteria}
```
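A hypothetical call on two toy output/reference pairs; token overlap drives the relevance and completeness scores:

```python
framework = HumanEvaluationFramework(['accuracy', 'relevance', 'completeness'])
scores = framework.evaluate(
    model_outputs=['Paris is the capital of France', 'The answer is 4'],
    references=['Paris is the capital of France', 'The answer is four'],
)
print(scores)  # accuracy averages to 0.5: exact match on the first pair only
```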
3. Performance Comparison
3.1 Metric Comparison
| Metric | Typical Use | Strength | Weakness |
|---|---|---|---|
| Perplexity | Language modeling | Fast to compute | Does not reflect generation quality |
| BLEU | Machine translation | Long-standing standard | Poor fit for open-ended generation |
| ROUGE | Summarization | Several variants (1/2/L) | Ignores semantics |
| Human evaluation | Holistic assessment | Accurate | Slow and expensive |
3.2 Benchmark Comparison
| Benchmark | Domain | Difficulty | Size |
|---|---|---|---|
| MMLU | Multi-domain knowledge | Medium-high | ~14K questions |
| GSM8K | Math | Medium | ~8K problems |
| HumanEval | Code | Medium-high | 164 problems |
| MT-Bench | Dialogue | Medium | 80 questions |
3.3 Model Evaluation Results
| 模型 | MMLU | GSM8K | HumanEval | MT-Bench |
|---|---|---|---|---|
| GPT-4 | 86.4% | 92% | 73% | 9.0 |
| Llama-2 70B | 68.7% | 56% | 48% | 6.8 |
| Mistral 7B | 60.1% | 42% | 35% | 6.2 |
4. Best Practices
4.1 Evaluation Pipeline
```python
class LLMEvaluator:
    """Facade that wires the metric calculators and benchmark runner together."""

    def __init__(self, model, tokenizer):
        self.perplexity_calculator = PerplexityCalculator(model, tokenizer)
        self.benchmark_runner = BenchmarkRunner(model, tokenizer)

    def calculate_perplexity(self, text):
        return self.perplexity_calculator.calculate(text)

    def run_mmlu(self, dataset):
        return self.benchmark_runner.run_mmlu(dataset)

    def run_gsm8k(self, dataset):
        return self.benchmark_runner.run_gsm8k(dataset)


def run_comprehensive_evaluation(model, tokenizer, config):
    evaluator = LLMEvaluator(model, tokenizer)
    results = {}
    if config.get('perplexity', True):
        results['perplexity'] = evaluator.calculate_perplexity(config['eval_text'])
    if config.get('benchmarks', True):
        results['mmlu'] = evaluator.run_mmlu(config['mmlu_dataset'])
        results['gsm8k'] = evaluator.run_gsm8k(config['gsm8k_dataset'])
    if config.get('human_eval', True):
        human_evaluator = HumanEvaluationFramework(
            ['accuracy', 'relevance', 'fluency'])
        results['human_eval'] = human_evaluator.evaluate(
            config['model_outputs'], config['references'])
    return results
```
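A sketch of the config dict `run_comprehensive_evaluation` expects; every key below is read off the function body rather than a fixed schema, and the datasets are toy placeholders:

```python
config = {
    'perplexity': True,
    'eval_text': 'The quick brown fox jumps over the lazy dog.',
    'benchmarks': True,
    'mmlu_dataset': toy_mmlu,  # the toy dataset from the smoke test above
    'gsm8k_dataset': [{'question': 'What is 3 + 4?', 'answer': 7}],
    'human_eval': True,
    'model_outputs': ['The answer is 4'],
    'references': ['The answer is 4'],
}
results = run_comprehensive_evaluation(model, tokenizer, config)
```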
4.2 Evaluation Report
```python
from datetime import datetime


class EvaluationReportGenerator:
    def generate(self, results, model_name):
        # Stdlib datetime avoids the pandas dependency for a timestamp.
        report = (
            f"LLM Evaluation Report\n"
            f"Model: {model_name}\n"
            f"Date: {datetime.now():%Y-%m-%d %H:%M}\n"
            f"---\n\n"
            f"1. Automatic metrics\n"
        )
        if 'perplexity' in results:
            report += f"Perplexity: {results['perplexity']:.2f}\n"
        if 'mmlu' in results:
            report += f"MMLU: {results['mmlu']:.2%}\n"
        if 'gsm8k' in results:
            report += f"GSM8K: {results['gsm8k']:.2%}\n"
        if 'human_eval' in results:
            report += "\n2. Human evaluation results\n"
            for criterion, score in results['human_eval'].items():
                report += f"{criterion}: {score:.2f}\n"
        return report
```
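Hypothetical end-to-end usage; the results dict would normally come from `run_comprehensive_evaluation` above:

```python
results = {'perplexity': 12.34, 'mmlu': 0.687, 'gsm8k': 0.56}
print(EvaluationReportGenerator().generate(results, 'my-model'))
```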
5. Summary
LLM evaluation is key to model selection and optimization:
- Automatic metrics: fast assessment of basic capabilities
- Benchmark tests: standardized comparison
- Human evaluation: the final validation step
- Comprehensive reporting: a full picture of model capabilities
Takeaways from the comparisons above:
- GPT-4 leads on every benchmark shown here
- Llama-2 70B is the strongest open-source model in this comparison
- Combining automatic and human evaluation is recommended
- MMLU and GSM8K are among the most widely used benchmarks