Identification of unique genes in each library. After the BLAST search, each library can contain more than one homolog of the same protein in the reference genome. To identify the coding sequences that correspond to a unique protein in the reference genome, the hits of each library are filtered with Python.
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib_venn import venn2
# Read the homologous proteins reported by BLASTp for the Smart library.
# The table is expected to be tab-separated with a header row that contains
# at least the columns 'Protein', 'subject' and 'identity'.
smartHomo = pd.read_csv('SmartTrinitypep.tab', sep='\t')

# Hits without an identity value cannot be ranked, so they are left out.
smart_hits = smartHomo.dropna(subset=['identity'])

# Step 1: for every protein of the library keep only its best hit, i.e. the
# row with the highest identity. idxmax returns the first row on ties, and
# sort=False keeps the proteins in their order of first appearance.
SMARTunq = smart_hits.loc[
    smart_hits.groupby('Protein', sort=False)['identity'].idxmax(),
    ['Protein', 'subject', 'identity'],
]

# Step 2: several library proteins may share the same best subject; for
# every subject keep only the library protein with the highest identity, so
# each reference protein appears exactly once in the result.
SMARTunq2 = SMARTunq.loc[
    SMARTunq.groupby('subject', sort=False)['identity'].idxmax(),
    ['subject', 'Protein', 'identity'],
]

# Column order (subject, Protein, identity) is kept because the shell step
# that follows reads the protein identifier from the second column.
SMARTunq2.to_csv("SMARTUniq.txt", sep='\t', index=False, header=True)
The same is carried out for the Dino-SL library.
# Continuation of the previous python code
# Read the homologous proteins reported by BLASTp for the Dino library.
# The table is expected to be tab-separated with a header row that contains
# at least the columns 'Protein', 'subject' and 'identity'.
dinoHomo = pd.read_csv('DinoTrinitypep.tab', sep='\t')

# Hits without an identity value cannot be ranked, so they are left out.
dino_hits = dinoHomo.dropna(subset=['identity'])

# Step 1: for every protein of the Dino library keep only its best hit, i.e.
# the row with the highest identity. idxmax returns the first row on ties,
# and sort=False keeps the proteins in their order of first appearance.
DINOunq = dino_hits.loc[
    dino_hits.groupby('Protein', sort=False)['identity'].idxmax(),
    ['Protein', 'subject', 'identity'],
]

# Step 2: several library proteins may share the same best subject; for
# every subject keep only the library protein with the highest identity, so
# each reference protein appears exactly once in the result.
DINOunq2 = DINOunq.loc[
    DINOunq.groupby('subject', sort=False)['identity'].idxmax(),
    ['subject', 'Protein', 'identity'],
]

# Same column layout as the Smart output: subject, Protein, identity.
DINOunq2.to_csv("DINOUniq.txt", sep='\t', index=False, header=True)
Finally, we can identify the similar proteins between the two libraries by means of a Venn diagram.
# Continuation of the previous python code
# Collect the 'subject' identifiers kept for each library.
SMARTlist = SMARTunq2.loc[:, 'subject'].tolist()
DiNOlist = DINOunq2.loc[:, 'subject'].tolist()

# Draw a two-set Venn diagram of the identifiers found in each library;
# the overlap holds the subjects present in both.
library_sets = [set(SMARTlist), set(DiNOlist)]
venn2(
    library_sets,
    set_labels=('SMART', 'DINO'),
    set_colors=('purple', 'skyblue'),
    alpha=0.7,
)
plt.title("Similar genes in libraries")
# Code for filtering coding regions from each library using Python's list of unique proteins
# The column of unique sequence identifiers (2nd column) is cut and a temporary
# pattern file is created. The header row is skipped (tail -n +2) so that the
# column name is not used as a search pattern; everything after the first "."
# of each identifier is removed.
tail -n +2 ./Blast/SMARTUniq.txt | cut -f 2 | cut -f1 -d"." > Smartuniqtemp.txt
# We search and retrieve the sequences from the original SmartTrinity.fasta file.
#   -F  identifiers are literal strings, not regular expressions
#   -w  whole-word match, so an identifier ending in "_i1" does not also pull "_i10"
#   -A1 also print the line after each header (assumes one-line sequences -- confirm)
# The second grep drops the "--" separator lines that -A1 inserts between
# non-adjacent matches, which would otherwise corrupt the FASTA file.
grep -A1 -F -w -f Smartuniqtemp.txt SmartTrinity.fasta | grep -v -- '^--$' > SmartUniq.fasta