Build a Biopython AI Agent for DNA & Protein Analysis in Colab
Overview
This tutorial shows how to build an accessible Bioinformatics AI Agent using Biopython and common Python libraries. The agent runs in Google Colab and combines sequence retrieval, molecular analysis, visualization, multiple sequence alignment, phylogenetic tree construction, motif searches, codon usage profiling, and sliding-window GC analysis into a single class-based pipeline.
Environment setup
Install essential packages and ClustalW in your Colab environment before running analyses.
!pip install biopython pandas numpy matplotlib seaborn plotly requests beautifulsoup4 scipy scikit-learn networkx
!apt-get update
!apt-get install -y clustalw
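Optionally, you can confirm the installs before moving on. The quick check below assumes the apt package exposes the ClustalW binary as clustalw, which is its usual name on Ubuntu-based Colab images.
import Bio
print("Biopython version:", Bio.__version__)
!which clustalw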
Imports and initialization
Load Biopython modules, visualization libraries, and supporting packages. Set your Entrez email before fetching sequences from NCBI.
import os
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from Bio import SeqIO, Entrez, Align, AlignIO, Phylo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import gc_fraction, molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import warnings
warnings.filterwarnings('ignore')
Entrez.email = "your_email@example.com"
The BioPythonAIAgent class
The provided class bundles sequence fetching, sample sequence creation, sequence analysis, visualization, alignment, tree building, codon usage profiling, motif scanning, GC sliding-window analysis, and comparative reporting. The full class code is included below and can be used directly inside a Colab notebook.
class BioPythonAIAgent:
    def __init__(self, email="your_email@example.com"):
        self.email = email
        Entrez.email = email
        self.sequences = {}
        self.analysis_results = {}
        self.alignments = {}
        self.trees = {}
    def fetch_sequence_from_ncbi(self, accession_id, db="nucleotide", rettype="fasta"):
        try:
            handle = Entrez.efetch(db=db, id=accession_id, rettype=rettype, retmode="text")
            record = SeqIO.read(handle, "fasta")
            handle.close()
            self.sequences[accession_id] = record
            return record
        except Exception as e:
            print(f"Error fetching sequence: {str(e)}")
            return None
    def create_sample_sequences(self):
        # Note: the spike and insulin samples are amino-acid sequences; the 16S rRNA sample is DNA.
        covid_spike = "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT"
        human_insulin = "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN"
        e_coli_16s = "AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAGCAGCTTGCTGCTTTGCTGACGAGTGGCGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAATGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTAGTAGGTGGGGTAACGGCTCACCTAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGCGTTAAGGTTAATAACCTTGGCGATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACA"
        sample_sequences = [
            ("COVID_Spike", covid_spike, "SARS-CoV-2 Spike Protein"),
            ("Human_Insulin", human_insulin, "Human Insulin Precursor"),
            ("E_coli_16S", e_coli_16s, "E. coli 16S rRNA")
        ]
        for seq_id, seq_str, desc in sample_sequences:
            record = SeqRecord(Seq(seq_str), id=seq_id, description=desc)
            self.sequences[seq_id] = record
        return sample_sequences
    def analyze_sequence(self, sequence_id=None, sequence=None):
        if sequence_id and sequence_id in self.sequences:
            seq_record = self.sequences[sequence_id]
            seq = seq_record.seq
            description = seq_record.description
        elif sequence:
            seq = Seq(sequence)
            description = "Custom sequence"
        else:
            return None
        analysis = {
            'length': len(seq),
            'composition': {}
        }
        for base in ['A', 'T', 'G', 'C']:
            analysis['composition'][base] = seq.count(base)
        if 'A' in analysis['composition'] and 'T' in analysis['composition']:
            analysis['gc_content'] = round(gc_fraction(seq) * 100, 2)
        try:
            analysis['molecular_weight'] = round(molecular_weight(seq, seq_type='DNA'), 2)
        except:
            # fallback: rough per-base estimate when the exact weight cannot be computed
            analysis['molecular_weight'] = len(seq) * 650
        try:
            if len(seq) % 3 == 0:
                protein = seq.translate()
                analysis['translation'] = str(protein)
                analysis['stop_codons'] = protein.count('*')
                # run protein-level analysis only when there are no internal stop codons
                if '*' not in str(protein)[:-1]:
                    prot_analysis = ProteinAnalysis(str(protein)[:-1])
                    analysis['protein_mw'] = round(prot_analysis.molecular_weight(), 2)
                    analysis['isoelectric_point'] = round(prot_analysis.isoelectric_point(), 2)
                    analysis['protein_composition'] = prot_analysis.get_amino_acids_percent()
        except:
            pass
        key = sequence_id if sequence_id else "custom"
        self.analysis_results[key] = analysis
        return analysis
    def visualize_composition(self, sequence_id):
        if sequence_id not in self.analysis_results:
            return
        analysis = self.analysis_results[sequence_id]
        fig = make_subplots(
            rows=2, cols=2,
            specs=[[{"type": "pie"}, {"type": "bar"}],
                   [{"colspan": 2}, None]],
            subplot_titles=("Nucleotide Composition", "Base Count", "Sequence Properties")
        )
        labels = list(analysis['composition'].keys())
        values = list(analysis['composition'].values())
        fig.add_trace(
            go.Pie(labels=labels, values=values, name="Composition"),
            row=1, col=1
        )
        fig.add_trace(
            go.Bar(x=labels, y=values, name="Count", marker_color=['red', 'blue', 'green', 'orange']),
            row=1, col=2
        )
        properties = ['Length', 'GC%', 'MW (kDa)']
        prop_values = [
            analysis['length'],
            analysis.get('gc_content', 0),
            analysis.get('molecular_weight', 0) / 1000
        ]
        fig.add_trace(
            go.Scatter(x=properties, y=prop_values, mode='markers+lines',
                       marker=dict(size=10, color='purple'), name="Properties"),
            row=2, col=1
        )
        fig.update_layout(
            title=f"Comprehensive Analysis: {sequence_id}",
            showlegend=False,
            height=600
        )
        fig.show()
    def perform_multiple_sequence_alignment(self, sequence_ids):
        # Note: this computes all pairwise global alignments rather than a true multiple alignment.
        if len(sequence_ids) < 2:
            return None
        sequences = []
        for seq_id in sequence_ids:
            if seq_id in self.sequences:
                sequences.append(self.sequences[seq_id])
        if len(sequences) < 2:
            return None
        from Bio.Align import PairwiseAligner
        aligner = PairwiseAligner()
        aligner.match_score = 2
        aligner.mismatch_score = -1
        aligner.open_gap_score = -2
        aligner.extend_gap_score = -0.5
        alignments = []
        for i in range(len(sequences)):
            for j in range(i + 1, len(sequences)):
                alignment = aligner.align(sequences[i].seq, sequences[j].seq)[0]
                alignments.append(alignment)
        return alignments
    def create_phylogenetic_tree(self, alignment_key=None, sequences=None):
        if alignment_key and alignment_key in self.alignments:
            alignment = self.alignments[alignment_key]
        elif sequences:
            records = []
            for i, seq in enumerate(sequences):
                record = SeqRecord(Seq(seq), id=f"seq_{i}")
                records.append(record)
            SeqIO.write(records, "temp.fasta", "fasta")
            try:
                # ClustalW wrapper (deprecated in recent Biopython releases); the apt package installs the binary as "clustalw"
                from Bio.Align.Applications import ClustalwCommandline
                clustalw_cline = ClustalwCommandline("clustalw", infile="temp.fasta")
                stdout, stderr = clustalw_cline()
                alignment = AlignIO.read("temp.aln", "clustal")
                os.remove("temp.fasta")
                os.remove("temp.aln")
                os.remove("temp.dnd")
            except:
                return None
        else:
            return None
        calculator = DistanceCalculator('identity')
        dm = calculator.get_distance(alignment)
        constructor = DistanceTreeConstructor()
        tree = constructor.upgma(dm)
        tree_key = f"tree_{len(self.trees)}"
        self.trees[tree_key] = tree
        return tree
    def visualize_tree(self, tree):
        fig, ax = plt.subplots(figsize=(10, 6))
        Phylo.draw(tree, axes=ax)
        plt.title("Phylogenetic Tree")
        plt.tight_layout()
        plt.show()
    def protein_structure_analysis(self, sequence_id):
        if sequence_id not in self.sequences:
            return None
        seq = self.sequences[sequence_id].seq
        try:
            if len(seq) % 3 == 0:
                protein = seq.translate()
                if '*' not in str(protein)[:-1]:
                    prot_analysis = ProteinAnalysis(str(protein)[:-1])
                    structure_analysis = {
                        'molecular_weight': prot_analysis.molecular_weight(),
                        'isoelectric_point': prot_analysis.isoelectric_point(),
                        'amino_acid_percent': prot_analysis.get_amino_acids_percent(),
                        'secondary_structure': prot_analysis.secondary_structure_fraction(),
                        'flexibility': prot_analysis.flexibility(),
                        'gravy': prot_analysis.gravy()
                    }
                    return structure_analysis
        except:
            pass
        return None
    def comparative_analysis(self, sequence_ids):
        results = []
        for seq_id in sequence_ids:
            if seq_id in self.analysis_results:
                analysis = self.analysis_results[seq_id].copy()
                analysis['sequence_id'] = seq_id
                results.append(analysis)
        df = pd.DataFrame(results)
        if len(df) > 1:
            fig = make_subplots(
                rows=2, cols=2,
                subplot_titles=("Length Comparison", "GC Content", "Molecular Weight", "Composition Heatmap")
            )
            fig.add_trace(
                go.Bar(x=df['sequence_id'], y=df['length'], name="Length"),
                row=1, col=1
            )
            if 'gc_content' in df.columns:
                fig.add_trace(
                    go.Scatter(x=df['sequence_id'], y=df['gc_content'], mode='markers+lines', name="GC%"),
                    row=1, col=2
                )
            if 'molecular_weight' in df.columns:
                fig.add_trace(
                    go.Bar(x=df['sequence_id'], y=df['molecular_weight'], name="MW"),
                    row=2, col=1
                )
            fig.update_layout(title="Comparative Sequence Analysis", height=600)
            fig.show()
        return df
    def codon_usage_analysis(self, sequence_id):
        if sequence_id not in self.sequences:
            return None
        seq = self.sequences[sequence_id].seq
        if len(seq) % 3 != 0:
            return None
        codons = {}
        for i in range(0, len(seq) - 2, 3):
            codon = str(seq[i:i+3])
            codons[codon] = codons.get(codon, 0) + 1
        codon_df = pd.DataFrame(list(codons.items()), columns=['Codon', 'Count'])
        codon_df = codon_df.sort_values('Count', ascending=False)
        fig = px.bar(codon_df.head(20), x='Codon', y='Count',
                     title=f"Top 20 Codon Usage - {sequence_id}")
        fig.show()
        return codon_df
    def motif_search(self, sequence_id, motif_pattern):
        if sequence_id not in self.sequences:
            return []
        seq = str(self.sequences[sequence_id].seq)
        positions = []
        for i in range(len(seq) - len(motif_pattern) + 1):
            if seq[i:i+len(motif_pattern)] == motif_pattern:
                positions.append(i)
        return positions
    def gc_content_window(self, sequence_id, window_size=100):
        if sequence_id not in self.sequences:
            return None
        seq = self.sequences[sequence_id].seq
        gc_values = []
        positions = []
        for i in range(0, len(seq) - window_size + 1, window_size//4):
            window = seq[i:i+window_size]
            gc_values.append(gc_fraction(window) * 100)
            positions.append(i + window_size//2)
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=positions, y=gc_values, mode='lines+markers',
                                 name=f'GC Content (window={window_size})'))
        fig.update_layout(
            title=f"GC Content Sliding Window Analysis - {sequence_id}",
            xaxis_title="Position",
            yaxis_title="GC Content (%)"
        )
        fig.show()
        return positions, gc_values
    def run_comprehensive_analysis(self, sequence_ids):
        results = {}
        for seq_id in sequence_ids:
            if seq_id in self.sequences:
                analysis = self.analyze_sequence(seq_id)
                self.visualize_composition(seq_id)
                gc_analysis = self.gc_content_window(seq_id)
                codon_analysis = self.codon_usage_analysis(seq_id)
                results[seq_id] = {
                    'basic_analysis': analysis,
                    'gc_window': gc_analysis,
                    'codon_usage': codon_analysis
                }
        if len(sequence_ids) > 1:
            comparative_df = self.comparative_analysis(sequence_ids)
            results['comparative'] = comparative_df
        return results
Running the agent
Instantiate the agent, create sample sequences, analyze them, and run a comprehensive pipeline.
agent = BioPythonAIAgent()
sample_seqs = agent.create_sample_sequences()
for seq_id, _, _ in sample_seqs:
    agent.analyze_sequence(seq_id)
results = agent.run_comprehensive_analysis(['COVID_Spike', 'Human_Insulin', 'E_coli_16S'])
print("BioPython AI Agent Tutorial Complete!")
print("Available sequences:", list(agent.sequences.keys()))
print("Available methods:", [method for method in dir(agent) if not method.startswith('_')])
Example analyses and visualization
Use visualization, motif search, codon usage, GC sliding windows, comparative tables, and phylogenetic tree construction as shown below.
agent.visualize_composition('COVID_Spike')
agent.gc_content_window('E_coli_16S', window_size=50)
agent.codon_usage_analysis('COVID_Spike')
comparative_df = agent.comparative_analysis(['COVID_Spike', 'Human_Insulin', 'E_coli_16S'])
print(comparative_df)
motif_positions = agent.motif_search('E_coli_16S', 'ATG')
print(f"'ATG' motif found at positions: {motif_positions}")
tree = agent.create_phylogenetic_tree(sequences=[
    str(agent.sequences['COVID_Spike'].seq[:300]),
    str(agent.sequences['Human_Insulin'].seq[:300]),
    str(agent.sequences['E_coli_16S'].seq[:300])
])
if tree:
    agent.visualize_tree(tree)
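Two methods defined in the class are not exercised above: perform_multiple_sequence_alignment and protein_structure_analysis. A minimal sketch of calling them on the built-in samples follows; note that protein_structure_analysis only returns results when a sequence translates cleanly (length divisible by three, no internal stop codons), so it may return None here.
pairwise_alignments = agent.perform_multiple_sequence_alignment(['COVID_Spike', 'Human_Insulin'])
if pairwise_alignments:
    print(f"Best pairwise alignment score: {pairwise_alignments[0].score}")
structure_info = agent.protein_structure_analysis('E_coli_16S')
if structure_info:
    print(f"Isoelectric point: {structure_info['isoelectric_point']:.2f}")
else:
    print("No protein-level results for this sequence")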
What you can do with this agent
- Fetch sequences from NCBI or use built-in samples such as the SARS-CoV-2 spike protein, human insulin precursor, and E. coli 16S rRNA (a fetch example follows this list).
- Compute nucleotide composition, GC content, and estimated molecular weight; translate coding sequences; and derive protein properties such as molecular weight and isoelectric point.
- Visualize composition and comparative metrics with Plotly and Matplotlib.
- Profile codon usage, search motifs, compute sliding-window GC percentages, perform pairwise alignments, and build phylogenetic trees.
- Use the class as a starting point for educational notebooks, research prototyping, or reproducible Colab workflows.
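To pull a sequence from NCBI instead of the bundled samples, call fetch_sequence_from_ncbi with any nucleotide accession. The sketch below uses NC_045512.2 (the SARS-CoV-2 reference genome) and assumes you have replaced the placeholder Entrez email with your own; it requires network access from the Colab runtime.
agent_ncbi = BioPythonAIAgent(email="your_email@example.com")
record = agent_ncbi.fetch_sequence_from_ncbi("NC_045512.2", db="nucleotide", rettype="fasta")
if record:
    print(record.id, len(record.seq), "bases")
    agent_ncbi.analyze_sequence("NC_045512.2")
    agent_ncbi.gc_content_window("NC_045512.2", window_size=500)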