Создание этически ориентированных автономных агентов на локальных open-source моделях

В этом руководстве показано, как разработать автономного агента, который совмещает достижение целей с соблюдением этических и организационных ценностей, используя открытые модели Hugging Face локально в Colab. Реализация разделяет модель политики, предлагающую действия, и модель этического обзора, которая оценивает и корректирует их; это позволяет принимать решения, ориентированные на ценности, без обращения к внешним API.

Установка и вспомогательные функции

Начните с установки пакетов и определения вспомогательных функций для sequence-to-sequence и causal генерации. Эти функции позволяют системе выдавать как рассуждения, так и творческие варианты действий.

!pip install -q transformers torch accelerate sentencepiece
 
 
import re

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
 
 
def generate_seq2seq(model, tokenizer, prompt, max_new_tokens=128):
    """Sample a completion for ``prompt`` from an encoder-decoder model.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        prompt: Input text fed to the encoder.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded first generated sequence, special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Move every input tensor to the model's device; otherwise generate()
    # fails with a device mismatch as soon as the model lives on the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    pad_id = tokenizer.eos_token_id
    if pad_id is None:
        pad_id = tokenizer.pad_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=pad_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 
def generate_causal(model, tokenizer, prompt, max_new_tokens=128):
    """Sample a continuation of ``prompt`` from a decoder-only model.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text (the echoed prompt is removed),
        stripped of surrounding whitespace.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Move every input tensor to the model's device; otherwise generate()
    # fails with a device mismatch as soon as the model lives on the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_length = inputs["input_ids"].shape[-1]

    pad_id = tokenizer.eos_token_id
    if pad_id is None:
        pad_id = tokenizer.pad_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=pad_id,
        )

    # The output starts with the prompt tokens. Drop them by token count:
    # slicing the decoded text by len(prompt) breaks whenever decoding does
    # not reproduce the prompt character-for-character.
    continuation_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()

Модели: политика и этический рецензент

Загрузите небольшую causal модель для генерации действий и seq2seq модель для оценки этики и выравнивания. Перенесите модели на CPU или GPU и убедитесь, что у токенизаторов есть pad-токены.

# Checkpoints: a causal LM drafts actions, a seq2seq model reviews them.
policy_model_name = "distilgpt2"
judge_model_name = "google/flan-t5-small"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Policy side: tokenizer plus model, placed on the chosen device.
policy_tokenizer = AutoTokenizer.from_pretrained(policy_model_name)
policy_model = AutoModelForCausalLM.from_pretrained(policy_model_name).to(device)

# Judge side: tokenizer plus model, placed on the chosen device.
judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
judge_model = AutoModelForSeq2SeqLM.from_pretrained(judge_model_name).to(device)

# Fall back to the EOS token wherever a tokenizer defines no pad token.
if policy_tokenizer.pad_token is None:
    policy_tokenizer.pad_token = policy_tokenizer.eos_token

if judge_tokenizer.pad_token is None:
    judge_tokenizer.pad_token = judge_tokenizer.eos_token

Архитектура агента: предложение, оценка, выравнивание

Класс EthicalAgent разделяет логику на генерацию кандидатов, этическую оценку и переписывание для соответствия ценностям. Модель политики генерирует кандидатов, рецензент оценивает риски и проблемы, а модуль выравнивания корректирует действие при необходимости.

class EthicalAgent:
    """Agent that proposes actions, reviews them ethically, and aligns them.

    A causal "policy" model drafts candidate actions. A seq2seq "judge" model
    reviews each candidate against the organisation's values and rewrites it
    where needed. ``decide`` ties the three steps together and picks the
    candidate whose review reports the lowest risk.
    """

    # Ordering used to pick the final plan: lower is safer.
    _RISK_RANK = {"LOW": 0, "MED": 1, "HIGH": 2}
    # Verdicts without a recognisable risk level sort after every known level.
    _UNKNOWN_RISK = 3
    # Text following a "RiskLevel" key, up to the next field or the line end.
    _RISK_FIELD = re.compile(
        r"risk\s*level(.*?)(?=issues\s*:|recommendation\s*:|$)",
        re.IGNORECASE | re.MULTILINE,
    )
    # Level keywords at a word start, so "follow" or "slow" never count as LOW.
    _RISK_WORD = re.compile(r"\b(low|med|high)", re.IGNORECASE)

    def __init__(self, policy_model, policy_tok, judge_model, judge_tok):
        """Store the policy and judge models with their tokenizers."""
        self.policy_model = policy_model
        self.policy_tok = policy_tok
        self.judge_model = judge_model
        self.judge_tok = judge_tok

    def propose_actions(self, user_goal, context, n_candidates=3):
        """Sample up to ``n_candidates`` distinct candidate actions.

        Only the first line of each completion is kept. Blank completions are
        dropped and duplicates removed (first occurrence wins), so the result
        may be shorter than ``n_candidates`` or even empty.
        """
        base_prompt = (
            "You are an autonomous operations agent. "
            "Given the goal and context, list a specific next action you will take:\n\n"
            f"Goal: {user_goal}\nContext: {context}\nAction:"
        )
        candidates = []
        for _ in range(n_candidates):
            completion = generate_causal(
                self.policy_model, self.policy_tok, base_prompt, max_new_tokens=40
            )
            first_line = completion.split("\n")[0].strip()
            if first_line:  # an empty action is not worth reviewing
                candidates.append(first_line)
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(candidates))

    def judge_action(self, action, org_values):
        """Ask the judge model for a risk review of ``action``.

        Returns the raw, stripped model output. The prompt requests the
        ``RiskLevel`` / ``Issues`` / ``Recommendation`` format, but the output
        is not validated here.
        """
        judge_prompt = (
            "You are the Ethics & Compliance Reviewer.\n"
            "Evaluate the proposed agent action.\n"
            "Return fields:\n"
            "RiskLevel (LOW/MED/HIGH),\n"
            "Issues (short bullet-style text),\n"
            "Recommendation (approve / modify / reject).\n\n"
            f"ORG_VALUES:\n{org_values}\n\n"
            f"ACTION:\n{action}\n\n"
            "Answer in this format:\n"
            "RiskLevel: ...\nIssues: ...\nRecommendation: ..."
        )
        verdict = generate_seq2seq(
            self.judge_model, self.judge_tok, judge_prompt, max_new_tokens=128
        )
        return verdict.strip()

    def align_action(self, action, verdict, org_values):
        """Ask the judge model to rewrite ``action`` so it follows ``org_values``.

        Returns the raw, stripped model output.
        """
        align_prompt = (
            "You are an Ethics Alignment Assistant.\n"
            "Your job is to FIX the proposed action so it follows ORG_VALUES.\n"
            "Keep it effective but safe, legal, and respectful.\n\n"
            f"ORG_VALUES:\n{org_values}\n\n"
            f"ORIGINAL_ACTION:\n{action}\n\n"
            f"VERDICT_FROM_REVIEWER:\n{verdict}\n\n"
            "Rewrite ONLY IF NEEDED. If original is fine, return it unchanged. "
            "Return just the final aligned action:"
        )
        aligned = generate_seq2seq(
            self.judge_model, self.judge_tok, align_prompt, max_new_tokens=128
        )
        return aligned.strip()

    @classmethod
    def _extract_risk(cls, verdict):
        """Map a review text to a sortable risk rank.

        Returns 0 (LOW), 1 (MED), 2 (HIGH) or 3 when no level can be read.
        Only the text belonging to the ``RiskLevel`` field is inspected, so
        words in the issues text cannot change the rank. When that field names
        several levels, the most severe one is used.
        """
        for field in cls._RISK_FIELD.finditer(verdict):
            ranks = [
                cls._RISK_RANK[word.upper()]
                for word in cls._RISK_WORD.findall(field.group(1))
            ]
            if ranks:
                return max(ranks)
        return cls._UNKNOWN_RISK

    def decide(self, user_goal, context, org_values, n_candidates=3):
        """Propose, review and align candidates, then pick the safest one.

        Returns:
            A report dict with the keys ``goal``, ``context``, ``org_values``,
            ``candidates_evaluated``, ``final_plan`` and
            ``final_plan_rationale``. ``final_plan`` is ``None`` when no
            candidate action could be generated.
        """
        proposals = self.propose_actions(user_goal, context, n_candidates=n_candidates)
        scored = []
        for act in proposals:
            verdict = self.judge_action(act, org_values)
            aligned_act = self.align_action(act, verdict, org_values)
            scored.append(
                {"original_action": act, "review": verdict, "aligned_action": aligned_act}
            )

        final_plan = None
        rationale = "No candidate actions were generated."
        if scored:
            # min() keeps the earliest candidate among equally ranked ones.
            best = min(scored, key=lambda entry: self._extract_risk(entry["review"]))
            final_plan = best["aligned_action"]
            rationale = best["review"]

        return {
            "goal": user_goal,
            "context": context,
            "org_values": org_values,
            "candidates_evaluated": scored,
            "final_plan": final_plan,
            "final_plan_rationale": rationale,
        }

Демонстрация и отчет

Определите организационные ценности и сценарий, запустите агента и распечатайте понятный отчет с кандидатами, этическими отзывами, выровненными действиями и окончательным планом.

# Organisational values, one bullet per line; passed as-is into the agent.
org_values_text = "\n".join(
    [
        "- Respect privacy; do not access personal data without consent.",
        "- Follow all laws and safety policies.",
        "- Avoid discrimination, harassment, or harmful manipulation.",
        "- Be transparent and truthful with stakeholders.",
        "- Prioritize user well-being and long-term trust over short-term gain.",
    ]
)

# Demo scenario: the goal to pursue and the situation the agent operates in.
demo_goal = "Increase customer adoption of the new financial product."
demo_context = " ".join(
    [
        "The agent works for a bank outreach team.",
        "The target customers are small family businesses.",
        "Regulations require honest disclosure of risks and fees.",
        "Cold-calling minors or lying about terms is illegal.",
    ]
)

# Wire the loaded models into the agent and evaluate four candidate actions.
agent = EthicalAgent(
    policy_model=policy_model,
    policy_tok=policy_tokenizer,
    judge_model=judge_model,
    judge_tok=judge_tokenizer,
)
report = agent.decide(
    user_goal=demo_goal,
    context=demo_context,
    org_values=org_values_text,
    n_candidates=4,
)
 
 
def pretty_report(r):
    """Print a human-readable summary of a decision report to stdout.

    Args:
        r: Mapping with the keys ``goal``, ``context``, ``org_values``,
            ``candidates_evaluated`` (a sequence of mappings with
            ``original_action``, ``review`` and ``aligned_action``),
            ``final_plan`` and ``final_plan_rationale``.
    """
    print("=== ETHICAL DECISION REPORT ===")
    for label, key in (("Goal", "goal"), ("Context", "context")):
        print(f"{label}: {r[key]}\n")
    print("Org Values:")
    print(r["org_values"])

    print("\n--- Candidate Evaluations ---")
    # (section title, candidate key, whether the value is printed indented)
    sections = (
        ("Original Action:", "original_action", True),
        ("Ethics Review:", "review", False),
        ("Aligned Action:", "aligned_action", True),
    )
    for number, candidate in enumerate(r["candidates_evaluated"], start=1):
        print(f"\nCandidate {number}:")
        for title, key, indented in sections:
            print(title)
            if indented:
                print(" ", candidate[key])
            else:
                print(candidate[key])

    print("\n--- Final Plan Selected ---")
    print(r["final_plan"])
    print("\nWhy this plan is acceptable (review snippet):")
    print(r["final_plan_rationale"])
 
 
# Print the report produced by the demo run.
pretty_report(report)

В результате вы получаете рабочую систему, где агент не только выбирает, что делать, но и оценивает, стоит ли это делать, корректирует свои решения и выравнивает поведение под человеческие и организационные принципы. Это практический пример встраивания ценностного выравнивания в агентные системы.

Создание этически ориентированных автономных агентов на локальных open-source моделях

Установка и вспомогательные функции

Модели: политика и этический рецензент

Архитектура агента: предложение, оценка, выравнивание

Демонстрация и отчет

Switch Language