Мультиагентный конвейер для анализа транскриптомики, протеомики и метаболомики с учетом путей
'Пошаговое руководство по мультиагентному конвейеру для генерации синтетических мультиомических данных, их статистического и сетевого анализа, обогащения путей и предсказания препаратов.'
Построение интегрированного мультиагентного конвейера для интерпретации мультиомики
В этом руководстве показано, как собрать модульную систему, объединяющую транскриптомику, протеомику и метаболомику. Каждый агент решает свою задачу: генерация синтетических данных, статистический анализ, сетевой анализ и поиск мастер-регуляторов, обогащение путей с учетом топологии, подбор препаратов для репурпозинга и формирование итогового отчета с гипотезами.
База биологических данных и начальная настройка
Система стартует с описания баз путей, взаимодействий генов и маппинга мишеней для препаратов. Эти справочные ресурсы используются во всех агентах для обхода сети, подсчета баллов для путей и оценки препаратов. Ниже — код, который задает эти структуры и импортирует библиотеки.
import numpy as np
import pandas as pd
from collections import defaultdict, deque
from dataclasses import dataclass
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')
PATHWAY_DB = {
'Glycolysis': {'genes': ['HK2', 'PFKM', 'PKM', 'LDHA', 'GAPDH', 'ENO1'],
'metabolites': ['Glucose', 'G6P', 'F16BP', 'Pyruvate', 'Lactate'], 'score': 0},
'TCA_Cycle': {'genes': ['CS', 'IDH1', 'IDH2', 'OGDH', 'SDHA', 'MDH2'],
'metabolites': ['Citrate', 'Isocitrate', 'α-KG', 'Succinate', 'Malate'], 'score': 0},
'Oxidative_Phosphorylation': {'genes': ['NDUFA1', 'NDUFB5', 'COX5A', 'COX7A1', 'ATP5A1', 'ATP5B'],
'metabolites': ['ATP', 'ADP', 'NAD+', 'NADH'], 'score': 0},
'Fatty_Acid_Synthesis': {'genes': ['ACACA', 'FASN', 'SCD1', 'ACLY'],
'metabolites': ['Malonyl-CoA', 'Palmitate', 'Oleate'], 'score': 0},
'Fatty_Acid_Oxidation': {'genes': ['CPT1A', 'ACOX1', 'HADHA', 'ACADM'],
'metabolites': ['Acyl-CoA', 'Acetyl-CoA'], 'score': 0},
'Amino_Acid_Metabolism': {'genes': ['GOT1', 'GOT2', 'GLUD1', 'BCAT1', 'BCAT2'],
'metabolites': ['Glutamate', 'Glutamine', 'Alanine', 'Aspartate'], 'score': 0},
'Pentose_Phosphate': {'genes': ['G6PD', 'PGD', 'TKTL1'],
'metabolites': ['R5P', 'NADPH'], 'score': 0},
'Cell_Cycle_G1S': {'genes': ['CCND1', 'CDK4', 'CDK6', 'RB1', 'E2F1'], 'metabolites': [], 'score': 0},
'Cell_Cycle_G2M': {'genes': ['CCNB1', 'CDK1', 'CDC25C', 'WEE1'], 'metabolites': [], 'score': 0},
'Apoptosis': {'genes': ['BCL2', 'BAX', 'BID', 'CASP3', 'CASP8', 'CASP9'], 'metabolites': [], 'score': 0},
'mTOR_Signaling': {'genes': ['MTOR', 'RPTOR', 'RICTOR', 'AKT1', 'TSC1', 'TSC2'],
'metabolites': ['Leucine', 'ATP'], 'score': 0},
'HIF1_Signaling': {'genes': ['HIF1A', 'EPAS1', 'VEGFA', 'SLC2A1'], 'metabolites': ['Lactate'], 'score': 0},
'p53_Signaling': {'genes': ['TP53', 'MDM2', 'CDKN1A', 'BAX'], 'metabolites': [], 'score': 0},
'PI3K_AKT': {'genes': ['PIK3CA', 'AKT1', 'AKT2', 'PTEN', 'PDK1'], 'metabolites': [], 'score': 0},
}
GENE_INTERACTIONS = {
'HK2': ['PFKM', 'HIF1A', 'MTOR'], 'PFKM': ['PKM', 'HK2'], 'PKM': ['LDHA', 'HIF1A'],
'MTOR': ['AKT1', 'HIF1A', 'TSC2'], 'HIF1A': ['VEGFA', 'SLC2A1', 'PKM', 'LDHA'],
'TP53': ['MDM2', 'CDKN1A', 'BAX', 'CASP3'], 'AKT1': ['MTOR', 'TSC2', 'MDM2'],
'CCND1': ['CDK4', 'RB1'], 'CDK4': ['RB1'], 'RB1': ['E2F1'],
}
DRUG_TARGETS = {
'Metformin': ['NDUFA1'], 'Rapamycin': ['MTOR'], '2-DG': ['HK2'],
'Bevacizumab': ['VEGFA'], 'Palbociclib': ['CDK4', 'CDK6'], 'Nutlin-3': ['MDM2']
}
@dataclass
class OmicsProfile:
transcriptomics: pd.DataFrame
proteomics: pd.DataFrame
metabolomics: pd.DataFrame
metadata: DictГенерация синтетических мультиомических данных
Генератор AdvancedOmicsGenerator создает согласованные матрицы транскриптомики, протеомики и метаболомики с трендами, заданными для конкретных путей и временных точек. Белковые уровни моделируются через транскрипты с добавлением шума.
class AdvancedOmicsGenerator:
@staticmethod
def generate_coherent_omics(n_samples=30, n_timepoints=4, noise=0.2):
genes = list(set(g for p in PATHWAY_DB.values() for g in p['genes']))
metabolites = list(set(m for p in PATHWAY_DB.values() for m in p['metabolites'] if m))
proteins = [f"P_{g}" for g in genes]
n_control = n_samples // 2
samples_per_tp = n_samples // n_timepoints
trans = np.random.randn(len(genes), n_samples) * noise + 10
metab = np.random.randn(len(metabolites), n_samples) * noise + 5
for tp in range(n_timepoints):
start_idx = n_control + tp * samples_per_tp
end_idx = start_idx + samples_per_tp
progression = (tp + 1) / n_timepoints
for i, gene in enumerate(genes):
if gene in PATHWAY_DB['Glycolysis']['genes']:
trans[i, start_idx:end_idx] += np.random.uniform(1.5, 3.5) * progression
elif gene in PATHWAY_DB['Oxidative_Phosphorylation']['genes']:
trans[i, start_idx:end_idx] -= np.random.uniform(1, 2.5) * progression
elif gene in PATHWAY_DB['Cell_Cycle_G1S']['genes'] + PATHWAY_DB['Cell_Cycle_G2M']['genes']:
trans[i, start_idx:end_idx] += np.random.uniform(1, 2) * progression
elif gene in PATHWAY_DB['HIF1_Signaling']['genes']:
trans[i, start_idx:end_idx] += np.random.uniform(2, 4) * progression
elif gene in PATHWAY_DB['p53_Signaling']['genes']:
trans[i, start_idx:end_idx] -= np.random.uniform(0.5, 1.5) * progression
for i, met in enumerate(metabolites):
if met in ['Lactate', 'Pyruvate', 'G6P']:
metab[i, start_idx:end_idx] += np.random.uniform(1.5, 3) * progression
elif met in ['ATP', 'Citrate', 'Malate']:
metab[i, start_idx:end_idx] -= np.random.uniform(1, 2) * progression
elif met in ['NADPH']:
metab[i, start_idx:end_idx] += np.random.uniform(1, 2) * progression
prot = trans * 0.8 + np.random.randn(*trans.shape) * (noise * 2)
conditions = ['Control'] * n_control + [f'Disease_T{i//samples_per_tp}' for i in range(n_samples - n_control)]
trans_df = pd.DataFrame(trans, index=genes, columns=[f"S{i}_{c}" for i, c in enumerate(conditions)])
prot_df = pd.DataFrame(prot, index=proteins, columns=trans_df.columns)
metab_df = pd.DataFrame(metab, index=metabolites, columns=trans_df.columns)
metadata = {'conditions': conditions, 'n_timepoints': n_timepoints}
return OmicsProfile(trans_df, prot_df, metab_df, metadata)Статистический анализ: дифференциальный и временной
StatisticalAgent вычисляет лог2-изменения, t-статистики, приближенные p-значения и FDR и отмечает значимые изменения. Temporal analysis извлекает линейные и квадратичные тренды для каждого гена.
class StatisticalAgent:
@staticmethod
def differential_analysis(data_df, control_samples, disease_samples):
control = data_df[control_samples]
disease = data_df[disease_samples]
fc = (disease.mean(axis=1) - control.mean(axis=1))
pooled_std = np.sqrt((control.var(axis=1) + disease.var(axis=1)) / 2)
t_stat = fc / (pooled_std + 1e-6)
p_values = 2 * (1 - np.minimum(np.abs(t_stat) / (np.abs(t_stat).max() + 1e-6), 0.999))
sorted_pvals = np.sort(p_values)
ranks = np.searchsorted(sorted_pvals, p_values) + 1
fdr = p_values * len(p_values) / ranks
return pd.DataFrame({'log2FC': fc, 't_stat': t_stat, 'p_value': p_values,
'FDR': np.minimum(fdr, 1.0), 'significant': (np.abs(fc) > 1.0) & (fdr < 0.05)}).sort_values('log2FC', ascending=False)
@staticmethod
def temporal_analysis(data_df, metadata):
timepoints = metadata['n_timepoints']
samples_per_tp = data_df.shape[1] // (timepoints + 1)
trends = {}
for gene in data_df.index:
means = []
for tp in range(timepoints):
start = samples_per_tp + tp * samples_per_tp
end = start + samples_per_tp
means.append(data_df.iloc[:, start:end].loc[gene].mean())
if len(means) > 1:
x = np.arange(len(means))
coeffs = np.polyfit(x, means, deg=min(2, len(means)-1))
trends[gene] = {'slope': coeffs[0] if len(coeffs) > 1 else 0, 'trajectory': means}
return trendsСетевой анализ и причинные связи
NetworkAnalysisAgent ищет мастер-регуляторы через обход графа и пытается связать транскрипт и белок, а также гены с метаболитами внутри путей, когда есть согласованное изменение.
class NetworkAnalysisAgent:
def __init__(self, interactions):
self.graph = interactions
def find_master_regulators(self, diff_genes):
sig_genes = diff_genes[diff_genes['significant']].index.tolist()
impact_scores = {}
for gene in sig_genes:
if gene in self.graph:
downstream = self._bfs_downstream(gene, max_depth=2)
sig_downstream = [g for g in downstream if g in sig_genes]
impact_scores[gene] = {
'downstream_count': len(downstream),
'sig_downstream': len(sig_downstream),
'score': len(sig_downstream) / (len(downstream) + 1),
'fc': diff_genes.loc[gene, 'log2FC']
}
return sorted(impact_scores.items(), key=lambda x: x[1]['score'], reverse=True)
def _bfs_downstream(self, start, max_depth=2):
visited, queue = set(), deque([(start, 0)])
downstream = []
while queue:
node, depth = queue.popleft()
if depth >= max_depth or node in visited:
continue
visited.add(node)
if node in self.graph:
for neighbor in self.graph[node]:
if neighbor not in visited:
downstream.append(neighbor)
queue.append((neighbor, depth + 1))
return downstream
def causal_inference(self, diff_trans, diff_prot, diff_metab):
causal_links = []
for gene in diff_trans[diff_trans['significant']].index:
gene_fc = diff_trans.loc[gene, 'log2FC']
protein = f"P_{gene}"
if protein in diff_prot.index:
prot_fc = diff_prot.loc[protein, 'log2FC']
correlation = np.sign(gene_fc) == np.sign(prot_fc)
if correlation and abs(prot_fc) > 0.5:
causal_links.append(('transcription', gene, protein, gene_fc, prot_fc))
for pathway, content in PATHWAY_DB.items():
if gene in content['genes']:
for metab in content['metabolites']:
if metab in diff_metab.index and diff_metab.loc[metab, 'significant']:
metab_fc = diff_metab.loc[metab, 'log2FC']
causal_links.append(('enzymatic', gene, metab, gene_fc, metab_fc))
return causal_linksОбогащение путей с учетом топологии
PathwayEnrichmentAgent суммирует вклад значимых генов и метаболитов, усиливая вклад генов с высокой сетевой центральностью, чтобы получить более информативные оценки активности путей.
class PathwayEnrichmentAgent:
def __init__(self, pathway_db, interactions):
self.pathway_db = pathway_db
self.interactions = interactions
def topology_weighted_enrichment(self, diff_genes, diff_metab, network_agent):
enriched = {}
for pathway, content in self.pathway_db.items():
sig_genes = [g for g in content['genes'] if g in diff_genes.index and diff_genes.loc[g, 'significant']]
weighted_score = 0
for gene in sig_genes:
base_score = abs(diff_genes.loc[gene, 'log2FC'])
downstream = network_agent._bfs_downstream(gene, max_depth=1)
centrality = len(downstream) / 10
weighted_score += base_score * (1 + centrality)
sig_metabs = [m for m in content['metabolites'] if m in diff_metab.index and diff_metab.loc[m, 'significant']]
metab_score = sum(abs(diff_metab.loc[m, 'log2FC']) for m in sig_metabs)
total_score = (weighted_score + metab_score * 2) / max(len(content['genes']) + len(content['metabolites']), 1)
if total_score > 0.5:
enriched[pathway] = {'score': total_score, 'genes': sig_genes, 'metabolites': sig_metabs,
'gene_fc': {g: diff_genes.loc[g, 'log2FC'] for g in sig_genes},
'metab_fc': {m: diff_metab.loc[m, 'log2FC'] for m in sig_metabs},
'coherence': self._pathway_coherence(sig_genes, diff_genes)}
return enriched
def _pathway_coherence(self, genes, diff_genes):
if len(genes) < 2:
return 0
fcs = [diff_genes.loc[g, 'log2FC'] for g in genes]
same_direction = sum(1 for fc in fcs if np.sign(fc) == np.sign(fcs[0]))
return same_direction / len(genes)Репурпозинг препаратов и генерация гипотез
Агент по репурпозингу оценивает потенциальную пользу препаратов на основе значимости и центральности их мишеней. Генератор гипотез сводит внятный отчет, комбинируя временные тренды, мастер-регуляторы, обогащенные пути, причинные связи и предсказания по препаратам.
class DrugRepurposingAgent:
def __init__(self, drug_db):
self.drug_db = drug_db
def predict_drug_response(self, diff_genes, master_regulators):
predictions = []
for drug, targets in self.drug_db.items():
score = 0
affected_targets = []
for target in targets:
if target in diff_genes.index:
fc = diff_genes.loc[target, 'log2FC']
is_sig = diff_genes.loc[target, 'significant']
if is_sig:
drug_benefit = -fc if fc > 0 else 0
score += drug_benefit
affected_targets.append((target, fc))
if target in [mr[0] for mr in master_regulators[:5]]:
score += 2
if score > 0:
predictions.append({
'drug': drug,
'score': score,
'targets': affected_targets,
'mechanism': 'Inhibition of upregulated pathway'
})
return sorted(predictions, key=lambda x: x['score'], reverse=True)
class AIHypothesisEngine:
def generate_comprehensive_report(self, omics_data, analysis_results):
report = ["="*80, "ADVANCED MULTI-OMICS INTERPRETATION REPORT", "="*80, ""]
trends = analysis_results['temporal']
top_trends = sorted(trends.items(), key=lambda x: abs(x[1]['slope']), reverse=True)[:5]
report.append(" TEMPORAL DYNAMICS ANALYSIS:")
for gene, data in top_trends:
direction = "↑ Increasing" if data['slope'] > 0 else "↓ Decreasing"
report.append(f" {gene}: {direction} (slope: {data['slope']:.3f})")
report.append("\n MASTER REGULATORS (Top 5):")
for gene, data in analysis_results['master_regs'][:5]:
report.append(f" • {gene}: Controls {data['sig_downstream']} dysregulated genes (FC: {data['fc']:+.2f}, Impact: {data['score']:.3f})")
report.append("\n ENRICHED PATHWAYS:")
for pathway, data in sorted(analysis_results['pathways'].items(), key=lambda x: x[1]['score'], reverse=True):
report.append(f"\n ► {pathway} (Score: {data['score']:.3f}, Coherence: {data['coherence']:.2f})")
report.append(f" Genes: {', '.join(data['genes'][:6])}")
if data['metabolites']:
report.append(f" Metabolites: {', '.join(data['metabolites'][:4])}")
report.append("\n CAUSAL RELATIONSHIPS (Top 10):")
for link_type, source, target, fc1, fc2 in analysis_results['causal'][:10]:
report.append(f" {source} →[{link_type}]→ {target} (FC: {fc1:+.2f} → {fc2:+.2f})")
report.append("\n DRUG REPURPOSING PREDICTIONS:")
for pred in analysis_results['drugs'][:5]:
report.append(f" • {pred['drug']} (Score: {pred['score']:.2f})")
report.append(f" Targets: {', '.join([f'{t[0]}({t[1]:+.1f})' for t in pred['targets']])}")
report.append("\n AI-GENERATED BIOLOGICAL HYPOTHESES:\n")
for i, hyp in enumerate(self._generate_advanced_hypotheses(analysis_results), 1):
report.append(f"{i}. {hyp}\n")
report.append("="*80)
return "\n".join(report)
def _generate_advanced_hypotheses(self, results):
hypotheses = []
pathways = results['pathways']
if 'Glycolysis' in pathways and 'Oxidative_PhosPHORYlation' in pathways:
glyc = pathways['Glycolysis']['score']
oxphos = pathways['Oxidative_PhosPHORYlation']['score']
if glyc > oxphos * 1.5:
hypotheses.append(
"WARBURG EFFECT DETECTED: Aerobic glycolysis upregulation with oxidative phosphorylation suppression suggests metabolic reprogramming driven by HIF1A."
)
if 'Cell_Cycle_G1S' in pathways and 'mTOR_Signaling' in pathways:
hypotheses.append(
"PROLIFERATIVE SIGNATURE: Cell-cycle activation with mTOR signaling indicates anabolic reprogramming; dual CDK4/6 and mTOR inhibition may be effective."
)
if results['master_regs']:
top_mr = results['master_regs'][0]
hypotheses.append(
f"UPSTREAM REGULATOR: {top_mr[0]} controls {top_mr[1]['sig_downstream']} dysregulated genes; targeting this node can propagate network-wide correction."
)
trends = results['temporal']
progressive = [g for g, d in trends.items() if abs(d['slope']) > 0.5]
if len(progressive) > 5:
hypotheses.append(
f"PROGRESSIVE DYSREGULATION: {len(progressive)} genes show strong temporal shifts, indicating evolving pathology and benefit from early pathway intervention."
)
if 'HIF1_Signaling' in pathways:
hypotheses.append(
"HYPOXIA RESPONSE: HIF1 signaling suggests oxygen-poor microenvironment; anti-angiogenic strategies may normalize perfusion."
)
if 'p53_Signaling' in pathways:
hypotheses.append(
"TUMOR SUPPRESSOR LOSS: p53 pathway suppression suggests benefit from MDM2 inhibition if TP53 is wild-type."
)
return hypotheses if hypotheses else ["Complex multi-factorial dysregulation detected."]Оркестрация и итог
Оркестратор запускает генератор и все агенты, собирает результаты и формирует отчет. Такая структура позволяет легко адаптировать конвейер к реальным данным и расширять его новыми агентами или ресурсами путей.
Switch Language
Read this article in English