Building Ethically Aligned Autonomous Agents Locally with Open-Source Models
A practical Colab-based tutorial that demonstrates how to build an autonomous agent that generates actions, evaluates them against organizational values, and self-corrects using open-source Hugging Face models.
This tutorial shows how to design an autonomous agent that balances goal pursuit with ethical and organizational values, using open-source Hugging Face models running locally in Colab. The implementation separates a policy model that proposes actions and an ethics judge model that evaluates and aligns them, allowing value-guided, self-correcting decision-making without external APIs.
Setup and helper functions
Begin by installing packages and defining helper functions for sequence-to-sequence and causal generation. These functions let the system produce reasoning-style outputs and creative candidate actions.
!pip install -q transformers torch accelerate sentencepiece
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
def generate_seq2seq(model, tokenizer, prompt, max_new_tokens=128):
    # Move inputs to the model's device so this works on both CPU and GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
def generate_causal(model, tokenizer, prompt, max_new_tokens=128):
    # Same sampling settings as above; causal models echo the prompt,
    # so the continuation is stripped out below.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
        )
    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Return only the newly generated text, not the echoed prompt.
    return full_text[len(prompt):].strip()

Models: policy and ethics judge
Load a small causal model to propose actions and a seq2seq model to serve as the ethics reviewer and alignment assistant. Move both models to the GPU when one is available, and make sure each tokenizer has a pad token set.
policy_model_name = "distilgpt2"
judge_model_name = "google/flan-t5-small"
policy_tokenizer = AutoTokenizer.from_pretrained(policy_model_name)
policy_model = AutoModelForCausalLM.from_pretrained(policy_model_name)
judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
judge_model = AutoModelForSeq2SeqLM.from_pretrained(judge_model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
policy_model = policy_model.to(device)
judge_model = judge_model.to(device)
if policy_tokenizer.pad_token is None:
    policy_tokenizer.pad_token = policy_tokenizer.eos_token
if judge_tokenizer.pad_token is None:
    judge_tokenizer.pad_token = judge_tokenizer.eos_token
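Before wiring up the agent, it is worth a quick smoke test of both helpers. This is an illustrative check with made-up prompts; outputs vary between runs because sampling is enabled:

# Illustrative smoke test: both helpers should return non-empty strings.
print(generate_causal(policy_model, policy_tokenizer,
                      "Goal: improve onboarding.\nAction:", max_new_tokens=20))
print(generate_seq2seq(judge_model, judge_tokenizer,
                       "Rate the risk (LOW/MED/HIGH) of: emailing every customer hourly.",
                       max_new_tokens=10))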
Agent architecture: propose, judge, align

The EthicalAgent class modularizes action proposal, ethical evaluation, and rewriting for alignment: the policy model generates candidate actions, the judge model evaluates risk and flags issues, and an alignment step rewrites actions when needed so they follow organizational values. The decide method then ranks the reviewed candidates by parsed risk level and selects the lowest-risk aligned action as the final plan.
class EthicalAgent:
    def __init__(self, policy_model, policy_tok, judge_model, judge_tok):
        self.policy_model = policy_model
        self.policy_tok = policy_tok
        self.judge_model = judge_model
        self.judge_tok = judge_tok

    def propose_actions(self, user_goal, context, n_candidates=3):
        # Sample several candidate actions from the policy model.
        base_prompt = (
            "You are an autonomous operations agent. "
            "Given the goal and context, list a specific next action you will take:\n\n"
            f"Goal: {user_goal}\nContext: {context}\nAction:"
        )
        candidates = []
        for _ in range(n_candidates):
            action = generate_causal(self.policy_model, self.policy_tok, base_prompt, max_new_tokens=40)
            # Keep only the first line so each candidate stays a single action.
            action = action.split("\n")[0]
            candidates.append(action.strip())
        # Deduplicate while preserving order.
        return list(dict.fromkeys(candidates))
    def judge_action(self, action, org_values):
        judge_prompt = (
            "You are the Ethics & Compliance Reviewer.\n"
            "Evaluate the proposed agent action.\n"
            "Return fields:\n"
            "RiskLevel (LOW/MED/HIGH),\n"
            "Issues (short bullet-style text),\n"
            "Recommendation (approve / modify / reject).\n\n"
            f"ORG_VALUES:\n{org_values}\n\n"
            f"ACTION:\n{action}\n\n"
            "Answer in this format:\n"
            "RiskLevel: ...\nIssues: ...\nRecommendation: ..."
        )
        verdict = generate_seq2seq(self.judge_model, self.judge_tok, judge_prompt, max_new_tokens=128)
        return verdict.strip()
    def align_action(self, action, verdict, org_values):
        align_prompt = (
            "You are an Ethics Alignment Assistant.\n"
            "Your job is to FIX the proposed action so it follows ORG_VALUES.\n"
            "Keep it effective but safe, legal, and respectful.\n\n"
            f"ORG_VALUES:\n{org_values}\n\n"
            f"ORIGINAL_ACTION:\n{action}\n\n"
            f"VERDICT_FROM_REVIEWER:\n{verdict}\n\n"
            "Rewrite ONLY IF NEEDED. If original is fine, return it unchanged. "
            "Return just the final aligned action:"
        )
        aligned = generate_seq2seq(self.judge_model, self.judge_tok, align_prompt, max_new_tokens=128)
        return aligned.strip()

    def decide(self, user_goal, context, org_values, n_candidates=3):
        proposals = self.propose_actions(user_goal, context, n_candidates=n_candidates)
        scored = []
        for act in proposals:
            verdict = self.judge_action(act, org_values)
            aligned_act = self.align_action(act, verdict, org_values)
            scored.append({"original_action": act, "review": verdict, "aligned_action": aligned_act})

        def extract_risk(vtext):
            # Map the reviewer's RiskLevel line to a sortable score;
            # unparseable verdicts sort last.
            for line in vtext.splitlines():
                if "RiskLevel" in line:
                    lvl = line.split(":", 1)[-1].strip().upper()
                    if "LOW" in lvl:
                        return 0
                    if "MED" in lvl:
                        return 1
                    if "HIGH" in lvl:
                        return 2
            return 3

        # Pick the candidate the reviewer considers least risky.
        scored_sorted = sorted(scored, key=lambda x: extract_risk(x["review"]))
        final_choice = scored_sorted[0]
        report = {
            "goal": user_goal,
            "context": context,
            "org_values": org_values,
            "candidates_evaluated": scored,
            "final_plan": final_choice["aligned_action"],
            "final_plan_rationale": final_choice["review"],
        }
        return report

Demo scenario and report
Define organizational values and a demo goal/context, run the agent, and print a readable report that lists candidates, ethical reviews, aligned actions, and the final chosen plan.
org_values_text = (
"- Respect privacy; do not access personal data without consent.\n"
"- Follow all laws and safety policies.\n"
"- Avoid discrimination, harassment, or harmful manipulation.\n"
"- Be transparent and truthful with stakeholders.\n"
"- Prioritize user well-being and long-term trust over short-term gain."
)
demo_goal = "Increase customer adoption of the new financial product."
demo_context = (
"The agent works for a bank outreach team. The target customers are small family businesses. "
"Regulations require honest disclosure of risks and fees. Cold-calling minors or lying about terms is illegal."
)
agent = EthicalAgent(policy_model, policy_tokenizer, judge_model, judge_tokenizer)
report = agent.decide(demo_goal, demo_context, org_values_text, n_candidates=4)
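As a quick optional check, you can dump the raw report as JSON before formatting it, since it contains only strings, lists, and dicts:

import json

# Optional: inspect the raw nested report before pretty-printing it.
print(json.dumps(report, indent=2))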
def pretty_report(r):
    print("=== ETHICAL DECISION REPORT ===")
    print(f"Goal: {r['goal']}\n")
    print(f"Context: {r['context']}\n")
    print("Org Values:")
    print(r["org_values"])
    print("\n--- Candidate Evaluations ---")
    for i, cand in enumerate(r["candidates_evaluated"], 1):
        print(f"\nCandidate {i}:")
        print("Original Action:")
        print("  ", cand["original_action"])
        print("Ethics Review:")
        print(cand["review"])
        print("Aligned Action:")
        print("  ", cand["aligned_action"])
    print("\n--- Final Plan Selected ---")
    print(r["final_plan"])
    print("\nWhy this plan is acceptable (review snippet):")
    print(r["final_plan_rationale"])

pretty_report(report)

This pipeline demonstrates how an agent can reason about actions, self-assess risks, request ethical review, and rewrite proposals to align with values. By separating generation, judgment, and alignment into modular steps, you can run value-guided decision-making locally with open-source models and iterate on the prompts, model choices, and scoring to fit different organizational policies.
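One natural extension, assuming the same two-model interface, is to swap in larger checkpoints for stronger proposals and reviews; the rest of the pipeline is unchanged. These are real Hugging Face model names, but any causal/seq2seq pair should work:

# Hypothetical extension: larger checkpoints, same pipeline.
policy_model_name = "gpt2-medium"         # any causal LM that fits in Colab memory
judge_model_name = "google/flan-t5-base"  # any instruction-tuned seq2seq LM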