Бенчмарк рассуждающих агентов: практический фреймворк для Direct, CoT, ReAct и Reflexion

Системный бенчмарк методов рассуждения агентов

В этом руководстве представлен воспроизводимый фреймворк для бенчмаркинга агентных компонентов с оценкой нескольких стратегий рассуждения на разных задачах. Мы сравниваем Direct, Chain-of-Thought (CoT), ReAct и Reflexion по точности, эффективности, задержке и использованию инструментов, чтобы понять компромиссы между скоростью и глубиной рассуждений.

Реализация агентов и основные интерфейсы

Мы подготавливаем основу фреймворка, импортируем необходимые библиотеки и определяем ядро архитектур агентов. Унифицированный интерфейс BaseAgent позволяет моделировать разные поведения агентов при оценке.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Callable, Tuple
from dataclasses import dataclass
from enum import Enum
import time
from collections import defaultdict
 
 
class ReasoningStrategy(Enum):
   DIRECT = "direct"
   CHAIN_OF_THOUGHT = "chain_of_thought"
   REACT = "react"
   REFLEXION = "reflexion"
 
 
@dataclass
class AgentResponse:
   answer: str
   steps: int
   time_taken: float
   tool_calls: int
   confidence: float
 
 
class BaseAgent:
   def __init__(self, strategy: ReasoningStrategy):
       self.strategy = strategy
       self.tool_count = 0
  
   def solve(self, problem: str) -> AgentResponse:
       start_time = time.time()
       if self.strategy == ReasoningStrategy.DIRECT:
           answer, steps, tools = self._direct_solve(problem)
       elif self.strategy == ReasoningStrategy.CHAIN_OF_THOUGHT:
           answer, steps, tools = self._cot_solve(problem)
       elif self.strategy == ReasoningStrategy.REACT:
           answer, steps, tools = self._react_solve(problem)
       else:
           answer, steps, tools = self._reflexion_solve(problem)
       time_taken = time.time() - start_time
       confidence = self._calculate_confidence(problem, answer)
       return AgentResponse(answer, steps, time_taken, tools, confidence)

Поведение стратегий

Каждая стратегия смоделирована согласно типичному поведению: прямой ответ, поэтапное рассуждение CoT, ReAct с чередованием рассуждений и вызова инструментов, а также Reflexion с рефлексией и уточнением ответа. Реализации симулируют шаги, использование инструментов и оценку уверенности, чтобы метрики отражали реалистичные паттерны.

 def _direct_solve(self, problem: str) -> Tuple[str, int, int]:
       answer = self._compute_answer(problem)
       return answer, 1, 0
  
   def _cot_solve(self, problem: str) -> Tuple[str, int, int]:
       steps = 3 + len(problem.split()) // 5
       for i in range(steps):
           _ = self._reason_step(problem, i)
       answer = self._compute_answer(problem)
       return answer, steps, 0
  
   def _react_solve(self, problem: str) -> Tuple[str, int, int]:
       steps = 4
       tool_calls = 2
       for i in range(steps):
           _ = self._reason_step(problem, i)
           if i % 2 == 0:
               self._use_tool(problem)
       answer = self._compute_answer(problem)
       return answer, steps, tool_calls
  
   def _reflexion_solve(self, problem: str) -> Tuple[str, int, int]:
       steps = 6
       tool_calls = 1
       initial_answer = self._compute_answer(problem)
       reflection = self._reflect(problem, initial_answer)
       answer = self._refine(problem, initial_answer, reflection)
       return answer, steps, tool_calls
  
   def _reason_step(self, problem: str, step: int) -> str:
       return f"Analyzing aspect {step+1}"
  
   def _use_tool(self, problem: str):
       self.tool_count += 1
       time.sleep(0.001)
  
   def _compute_answer(self, problem: str) -> str:
       return f"Solution_{hash(problem) % 100}"
  
   def _reflect(self, problem: str, answer: str) -> str:
       return "Reflection on approach"
  
   def _refine(self, problem: str, answer: str, reflection: str) -> str:
       return f"Refined_{answer}"
  
   def _calculate_confidence(self, problem: str, answer: str) -> float:
       base_confidence = 0.7
       strategy_bonus = {
           ReasoningStrategy.DIRECT: 0.0,
           ReasoningStrategy.CHAIN_OF_THOUGHT: 0.1,
           ReasoningStrategy.REACT: 0.15,
           ReasoningStrategy.REFLEXION: 0.2
       }
       return min(1.0, base_confidence + strategy_bonus[self.strategy] + np.random.uniform(-0.1, 0.1))

Пакет бенчмарков

Набор задач создаётся с разными типами и уровнями сложности. Каждый BenchmarkTask возвращает метрики (accuracy, efficiency, latency, tool_efficiency), а BenchmarkSuite прогоняет агентов по задачам и собирает табличный результат для анализа.

class BenchmarkTask:
   def __init__(self, name: str, difficulty: float, ground_truth: str):
       self.name = name
       self.difficulty = difficulty
       self.ground_truth = ground_truth
  
   def evaluate(self, response: AgentResponse) -> Dict[str, float]:
       accuracy = response.confidence * (1 - self.difficulty * 0.3)
       return {
           'accuracy': accuracy,
           'efficiency': 1.0 / (response.steps + 1),
           'latency': response.time_taken,
           'tool_efficiency': 1.0 / (response.tool_calls + 1)
       }
 
 
class BenchmarkSuite:
   def __init__(self):
       self.tasks = self._create_tasks()
  
   def _create_tasks(self) -> List[BenchmarkTask]:
       tasks = []
       task_types = [
           ("Math_Problem", 0.3),
           ("Logic_Puzzle", 0.5),
           ("Code_Debug", 0.6),
           ("Complex_Reasoning", 0.8),
           ("Multi_Step_Planning", 0.7)
       ]
       for i, (task_type, difficulty) in enumerate(task_types):
           for j in range(3):
               task = BenchmarkTask(
                   name=f"{task_type}_{j+1}",
                   difficulty=difficulty + np.random.uniform(-0.1, 0.1),
                   ground_truth=f"GT_{i}_{j}"
               )
               tasks.append(task)
       return tasks
  
   def run_benchmark(self, agents: List[BaseAgent]) -> pd.DataFrame:
       results = []
       for agent in agents:
           for task in self.tasks:
               response = agent.solve(task.name)
               metrics = task.evaluate(response)
               results.append({
                   'strategy': agent.strategy.value,
                   'task': task.name,
                   'difficulty': task.difficulty,
                   'accuracy': metrics['accuracy'],
                   'efficiency': metrics['efficiency'],
                   'latency': metrics['latency'],
                   'tool_efficiency': metrics['tool_efficiency'],
                   'steps': response.steps,
                   'tool_calls': response.tool_calls
               })
       return pd.DataFrame(results)

Анализ и визуализация

Агрегация и графики показывают, как стратегии различаются по метрикам и по уровням сложности. Визуализация помогает выявить компромиссы и оценить, какие подходы дают лучшее соотношение точности и затрат ресурсов.

def analyze_results(df: pd.DataFrame):
   agg_metrics = df.groupby('strategy').agg({
       'accuracy': ['mean', 'std'],
       'efficiency': ['mean', 'std'],
       'latency': ['mean', 'std'],
       'steps': 'mean',
       'tool_calls': 'mean'
   }).round(3)
   print(agg_metrics)
  
   diff_bins = pd.cut(df['difficulty'], bins=3, labels=['Easy', 'Medium', 'Hard'])
   diff_analysis = df.groupby(['strategy', diff_bins])['accuracy'].mean().unstack()
   print(diff_analysis.round(3))
  
   tradeoff = df.groupby('strategy').agg({
       'accuracy': 'mean',
       'steps': 'mean',
       'latency': 'mean'
   })
   tradeoff['score'] = (tradeoff['accuracy'] / (tradeoff['steps'] * tradeoff['latency'])).round(3)
   print(tradeoff.round(3))
 
 
def visualize_results(df: pd.DataFrame):
   fig, axes = plt.subplots(2, 2, figsize=(14, 10))
   sns.barplot(data=df, x='strategy', y='accuracy', ax=axes[0, 0], errorbar='sd')
   axes[0, 0].set_title('Accuracy by Strategy')
   axes[0, 0].tick_params(axis='x', rotation=45)
  
   for strategy in df['strategy'].unique():
       strategy_df = df[df['strategy'] == strategy]
       axes[0, 1].scatter(strategy_df['steps'], strategy_df['accuracy'], label=strategy, alpha=0.6, s=50)
   axes[0, 1].set_title('Steps vs Accuracy')
   axes[0, 1].legend()
  
   difficulty_bins = pd.cut(df['difficulty'], bins=3, labels=['Easy', 'Medium', 'Hard'])
   df_plot = df.copy()
   df_plot['difficulty_bin'] = difficulty_bins
   sns.boxplot(data=df_plot, x='difficulty_bin', y='accuracy', hue='strategy', ax=axes[1, 0])
   axes[1, 0].set_title('Performance vs Difficulty')
  
   scores = df.groupby('strategy').apply(
       lambda x: x['accuracy'].mean() / (x['steps'].mean() * x['latency'].mean())
   ).sort_values()
   axes[1, 1].barh(range(len(scores)), scores.values)
   axes[1, 1].set_yticks(range(len(scores)))
   axes[1, 1].set_yticklabels(scores.index)
   axes[1, 1].set_title('Overall Efficiency Score')
  
   plt.tight_layout()
   plt.show()

Запуск и выводы

Прогоните набор задач для всех агентов, выполните анализ и визуализацию, чтобы получить инсайты о соотношении точности и вычислительных затрат у разных стратегий. Фреймворк помогает сравнивать, отлаживать и оптимизировать агентные поведения.