Установка и зависимости

Для запуска локального GPT-подобного чат-агента понадобятся Hugging Face Transformers, PyTorch и несколько утилитных библиотек. Установите необходимые пакеты и импортируйте модули:

!pip install transformers accelerate sentencepiece --quiet
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Tuple, Optional
import textwrap, json, os

Настройка модели и системный промпт

Выберите лёгкую инструкционно-настроенную модель, которая понимает диалоговые промпты, и определите системный промпт для управления поведением ассистента. Установите лимит на генерируемые токены.

# Identifier of the instruction-tuned checkpoint passed to from_pretrained().
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Behaviour rules used as the system turn of the conversation. Each fragment
# carries its own trailing space, so a plain concatenation yields one paragraph.
BASE_SYSTEM_PROMPT = "".join(
    [
        "You are a custom GPT running locally. ",
        "Follow user instructions carefully. ",
        "Be concise and structured. ",
        "If something is unclear, say it is unclear. ",
        "Prefer practical examples over corporate examples unless explicitly asked. ",
        "When asked for code, give runnable code.",
    ]
)

# Upper bound on the number of tokens generated for a single reply.
MAX_NEW_TOKENS = 256

Загрузка модели

Загрузите токенизатор и модель из Hugging Face. Устройство выбирается автоматически (device_map="auto"), а тип данных зависит от наличия GPU: float16, если доступна CUDA, и float32 в противном случае.

# Announce the start of loading before the from_pretrained() calls below.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# If the tokenizer defines no padding token, reuse the end-of-sequence id so
# that a pad id is always set.
if tokenizer.pad_token_id is None:
   tokenizer.pad_token_id = tokenizer.eos_token_id
# Weights are loaded as float16 when CUDA is available and as float32
# otherwise; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
   MODEL_NAME,
   torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
   device_map="auto"
)
# Evaluation mode: the model is only used for generation in this file.
model.eval()
print("Model loaded.")

Формат диалога и сборка промпта

Храните историю диалога, включая системный блок, и собирайте единый текстовый промпт с явными ролями system/user/assistant, чтобы модель корректно учитывала контекст.

# A conversation is an ordered list of (role, content) string pairs.
ConversationHistory = List[Tuple[str, str]]
# Module-level dialogue state; it starts with the system prompt as its only entry.
history: ConversationHistory = [("system", BASE_SYSTEM_PROMPT)]
 
 
def wrap_text(s: str, w: int = 100) -> str:
    """Wrap *s* to at most *w* columns while keeping its line breaks.

    ``textwrap.wrap`` treats its whole input as a single paragraph and
    replaces newlines with spaces, so wrapping a multi-line reply in one call
    fuses bullet lists and code into one run of text. Each input line is
    therefore wrapped on its own, and blank lines are kept.

    Args:
        s: Text to wrap; may span several lines.
        w: Maximum width of an output line, in characters.

    Returns:
        The wrapped lines joined with newlines; an empty string for empty
        input.
    """
    wrapped_lines = []
    for line in s.splitlines():
        # wrap() returns [] for an empty or whitespace-only line; emit an
        # empty line instead so paragraph spacing survives.
        wrapped_lines.extend(textwrap.wrap(line, width=w) or [""])
    return "\n".join(wrapped_lines)
 
 
def build_chat_prompt(history: ConversationHistory, user_msg: str) -> str:
    """Render the stored turns plus a new user message as one prompt string.

    Every turn is written as ``<|role|>``, a newline, the content and another
    newline. Turns whose role is not system, user or assistant are left out.
    The new message is appended as a user turn, and the result ends with an
    open ``<|assistant|>`` header for the model to continue.
    """
    known_roles = ("system", "user", "assistant")
    turns = [(role, content) for role, content in history if role in known_roles]
    turns.append(("user", user_msg))
    rendered = "".join(f"<|{role}|>\n{content}\n" for role, content in turns)
    return rendered + "<|assistant|>\n"

Локальные инструменты и маршрутизация

Добавьте простые встроенные инструменты, которые имитируют поиск или извлечение документации. Маршрутизатор проверяет префиксы типа "search:" или "docs:" и возвращает полезный контекст для модели.

def local_tool_router(user_msg: str) -> Optional[str]:
    """Return canned tool output for a prefixed message, or None.

    A message is routed when, ignoring case and surrounding whitespace, it
    starts with ``search:`` or ``docs:``. The text after the first colon is
    stripped and echoed back in its original casing inside a fixed template.
    """
    normalized = user_msg.strip().lower()
    is_search = normalized.startswith("search:")
    if not is_search and not normalized.startswith("docs:"):
        return None
    # A prefix matched, so the raw message contains a colon; the argument is
    # taken from the raw message to keep the user's casing.
    argument = user_msg.partition(":")[2].strip()
    if is_search:
        return f"Search results about '{argument}':\n- Key point 1\n- Key point 2\n- Key point 3"
    return f"Documentation extract on '{argument}':\n1. The agent orchestrates tools.\n2. The model consumes output.\n3. Responses become memory."

Генерация ответов и хранение истории

Составьте итоговый промпт, токенизируйте и прогоните через модель. Декодируйте выход, извлеките ответ ассистента и добавьте новые записи в историю. Также доступны функции сохранения и загрузки истории.

def generate_reply(history: ConversationHistory, user_msg: str) -> str:
    """Generate the assistant's answer to *user_msg* and record the exchange.

    If ``local_tool_router`` returns context for the message, that context is
    appended to the message before the prompt is built. The prompt is
    tokenized, moved to the model's device and completed with sampling
    (top_p=0.9, temperature=0.6, at most ``MAX_NEW_TOKENS`` new tokens).

    Args:
        history: Conversation so far. It is mutated in place: the user turn
            (including any tool context) and the assistant turn are appended.
        user_msg: The new user message.

    Returns:
        The generated reply with surrounding whitespace stripped.
    """
    tool_context = local_tool_router(user_msg)
    if tool_context:
        user_msg = user_msg + "\n\nUseful context:\n" + tool_context
    prompt = build_chat_prompt(history, user_msg)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            top_p=0.9,
            temperature=0.6,
            pad_token_id=tokenizer.eos_token_id,
        )
    # The returned sequence starts with the prompt tokens, so the reply is
    # isolated by token count. Cutting the decoded *text* by role marker or
    # by len(prompt) is unreliable: decoding skips special tokens, so the
    # decoded text need not contain the markers or match the prompt's length.
    new_token_ids = output_ids[0][prompt_token_count:]
    reply = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    history.append(("user", user_msg))
    history.append(("assistant", reply))
    return reply
 
 
def save_history(history: ConversationHistory, path: str = "chat_history.json") -> None:
    """Write *history* to *path* as a JSON array of role/content objects.

    Each (role, content) pair becomes ``{"role": ..., "content": ...}``; the
    file is overwritten and indented by two spaces.
    """
    records = []
    for role, content in history:
        records.append({"role": role, "content": content})
    with open(path, "w") as out_file:
        json.dump(records, out_file, indent=2)
 
 
def load_history(path: str = "chat_history.json") -> ConversationHistory:
    """Load a conversation from the JSON file at *path*.

    Args:
        path: File holding a JSON array of objects with ``role`` and
            ``content`` keys.

    Returns:
        The stored turns as (role, content) tuples, or a fresh history that
        contains only the system prompt when *path* does not exist.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks ``role`` or ``content``.
    """
    if not os.path.exists(path):
        return [("system", BASE_SYSTEM_PROMPT)]
    # Read as UTF-8 explicitly instead of the locale's default encoding, so
    # that a file containing non-ASCII text loads the same on every platform.
    # ASCII-only files (json.dump's default output) are read unchanged.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [(item["role"], item["content"]) for item in data]

Демонстрация и интерактив

Запустите несколько демо-запросов, чтобы проверить поведение, и при желании используйте интерактивный цикл для общения с ассистентом.

# First demo turn: a plain request without a tool prefix. generate_reply()
# appends the exchange to the shared module-level history.
print("\n--- Demo turn 1 ---")
demo_reply_1 = generate_reply(history, "Explain what this custom GPT setup is doing in 5 bullet points.")
print(wrap_text(demo_reply_1))


# Second demo turn: the "search:" prefix is picked up by local_tool_router(),
# so canned search context is attached to the message. The turn continues the
# same history as the first one.
print("\n--- Demo turn 2 ---")
demo_reply_2 = generate_reply(history, "search: agentic ai with local models")
print(wrap_text(demo_reply_2))
 
 
def interactive_chat():
    """Run a console chat loop on the shared module-level ``history``.

    Lines are read until the user types ``exit``, ``quit`` or ``q`` (any
    case), input ends (EOF), or Ctrl-C is pressed at the prompt. Blank lines
    are skipped, so no empty user turn is sent to the model or stored in the
    history. Each reply is printed wrapped by ``wrap_text``.
    """
    print("\nChat ready. Type 'exit' to stop.")
    while True:
        try:
            user_msg = input("\nUser: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input and Ctrl-C at the prompt both end the session
            # quietly instead of surfacing a traceback.
            break
        if not user_msg:
            continue
        if user_msg.lower() in ("exit", "quit", "q"):
            break
        reply = generate_reply(history, user_msg)
        print("\nAssistant:\n" + wrap_text(reply))
 
 
# Uncomment the next line to start the interactive console loop.
# interactive_chat()
print("\nCustom GPT initialized successfully.")

Итог

Такой подход позволяет создать локальный интерактивный агент, который сочетает в себе промпт-оркестрацию, простую маршрутизацию инструментов и управление памятью диалога. Это даёт гибкость для экспериментов с правилами поведения, интеграциями и локальными источниками данных.

Запустите локальный GPT-подобный чат с Hugging Face Transformers