- pandas
- numpy
import pandas as pd
import numpy as np
from pyodide.http import open_url
#Read restriction enzyme data and corresponding recognition site
EnzymeFile = pd.read_csv(open_url('https://raw.githubusercontent.com/NanaAdwoaNewman/RecogSiteMod/main/RestrictionEnzymes.csv'))
res = EnzymeFile['Enzyme'].values
recog = EnzymeFile['Sequence'].values
resEnzyme = [i for i in res if i is not np.nan]
recogSite = [i for i in recog if i is not np.nan]
#AMINO ACID SEQUENCES
Ala = ["GCT", "GCC", "GCA", "GCG"] #Ala
Arg = ["CGT", "CGC", "CGA", "CGG", "AGA", "AGG"] #Arg
Asn = ["AAT", "AAC"] #Asn
Asp = ["GAT", "GAC"] #Asp
Cys = ["TGT", "TGC"] #Cys
Glu = ["GAA", "GAG"] #Glu
Gln = ["CAA", "CAG"] #Gln
Gly = ["GGT", "GGC", "GGA", "GGG"] #Gly
His = ["CAT", "CAC"] #His
Ile = ["ATT", "ATC", "ATA"] #Ile
Leu = ["TTA", "TTG", "CTT", "CTC", "CTA", "CTG"] #Leu
Lys = ["AAA","AAG"] #Lys
#Met = ["ATG"] #Met
Phe = ["TTT", "TTC"] #Phe
Pro = ["CCT", "CCC", "CCA", "CCG"] #Pro
Ser = ["TCT", "TCC", "TCA", "TCG", "AGT", "AGC"] #Ser
Thr = ["ACT", "ACC", "ACA", "ACG"] #Thr
#Trp = ["TGG"] #Trp
Tyr = ["TAT", "TAC"] #Tyr
Val = ["GTT", "GTC", "GTA", "GTG"] #Val
#aminoAcids = np.array(['A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V'])
aminoAcidsArray = [Ala,Arg,Asn,Asp,Cys,Glu,Gln,Gly,His,Ile,Leu,Lys,Phe,Pro,Ser,Thr,Tyr,Val]
#BASES
R = ['A', 'G'] #Purine
Y = ['T', 'C'] #Pyrimidine
W = ['A', 'T']
S = ['G', 'C']
M = ['A', 'C']
K = ['G', 'T']
H = ['A', 'T', 'C']
B = ['G', 'C', 'T']
V = ['G', 'A', 'C']
D = ['G', 'A', 'T']
N = ['G', 'A', 'T', 'C']
bases = np.array(['R','Y','W','S','M','K','H','B','V','D'])
basesArray = [R, Y, W, S, M, K, H, B, V, D]
#Check length of DNA Sequence
def CheckSequenceLen(sequence):
lenSeq = len(sequence)
msg = "DNA sequence is mutated or length of DNA sequence is invalid"
if lenSeq%3 == 0:
return "done"
else:
return msg
#Check start and end codons
def StartEndCodonCheck(sequence, direction):
if direction == "F":
startCodon = sequence[0:3].lower()
stopCodon = sequence[-3:].lower()
elif direction == "R":
startCodon = sequence[-3:][::-1].lower()
stopCodon = sequence[0:3][::-1].lower()
if stopCodon == "tag" or stopCodon == "tga" or stopCodon == "taa":
if startCodon == "atg":
return "1"
else:
return "Invalid Start Codon"
else:
return "Invalid Stop Codon"
#Checking the compatibilty of seq2modify and recogSiteCode when 'N' is present
#in recogSiteCode
def NInRecogSiteCompare(seq2modify, recogSiteCode):
x = 0
for i in recogSiteCode:
if seq2modify[x] == i or i == 'N' or i in bases:
x = x + 1
else:
return 0
break
if x == len(recogSiteCode):
return 1
def reverseStr(sequence, direction):
if direction == 'F':
sequence = sequence
elif direction == 'R':
if sequence != 'Could not modify sequence':
sequence = sequence[::-1]
return sequence
def recogSiteModification(sequence, resEnzIndex, direction):
sequence = reverseStr(sequence, direction)
resEnz = resEnzyme[resEnzIndex]
recogSiteCode = recogSite[resEnzIndex]
#print(recogSiteCode)
lenSequence = len(sequence)
lenRecogSite = len(recogSiteCode)
origSequence = sequence
Xs = []
#Loop through the gene sequence
for x in range(lenSequence - lenRecogSite + 1):
recogSiteCode = recogSite[resEnzIndex]
seq2modify = sequence[x:x+lenRecogSite].upper()
for i in range(lenRecogSite):
if recogSiteCode[i] == seq2modify[i]:
continue
else:
if recogSiteCode[i] in bases:
baseIndex = np.where(bases==recogSiteCode[i])[0][0]
for j in basesArray[baseIndex]:
if j == seq2modify[i]:
recogSiteCode = recogSiteCode[:i]+ seq2modify[i] + recogSiteCode[i+1:]
break
else:
break
#Compare the modified recognition code with the section of the sequence
if seq2modify.lower() == recogSiteCode.lower():
if lenRecogSite == 3:
if x%3 == 1:
for c in aminoAcidsArray:
if seq2modify in c:
for d in c:
if d == seq2modify[:-1]:
continue
else:
sequence = sequence[:x] + d + sequence[x+3:]
Xs.append(x)
break
else:
continue
elif lenRecogSite == 4:
#NB: * shows the start and end of the restriction enzyme recognition site
#ATG|C*AG|CT*C|TAG
if x%3 == 1:
continue
#ATG|CA*A|GCT*|TAG
elif x%3 == 2:
for a in aminoAcidsArray:
if seq2modify[1:] in a:
for b in a:
if b == seq2modify[1:]:
continue
else:
sequence = sequence[:x+1] + b + sequence[x+4:]
Xs.append(x+1)
break
else:
continue
#ATG|*AGC|T*CG|TAG
else:
for c in aminoAcidsArray:
if seq2modify[:-1] in c:
for d in c:
if d == seq2modify[:-1]:
continue
else:
sequence = sequence[:x] + d + sequence[x+3:]
Xs.append(x)
break
else:
continue
else:
#ATG|C*GG|ATC*|CGA|TAG
if x%3 == 1:
for a in aminoAcidsArray:
if seq2modify[2:5] in a:
for b in a:
if b == seq2modify[2:5]:
continue
else:
sequence = sequence[:x+2] + b + sequence[x+5:]
Xs.append(x+2)
break
else:
continue
#ATG|CA*G|GAT|C*TG|TAG
elif x%3 == 2:
for c in aminoAcidsArray:
if seq2modify[1:4] in c:
for d in c:
if d == seq2modify[1:4]:
continue
else:
sequence = sequence[:x+1] + d + sequence[x+4:]
Xs.append(x+1)
break
else:
continue
#ATG|*GGA|TC*G|CGA|TAG
elif x%3 == 0:
for e in aminoAcidsArray:
if seq2modify[:3] in e:
for f in e:
if f == seq2modify[:3]:
continue
else:
sequence = sequence[:x] + f + sequence[x+3:]
Xs.append(x)
break
else:
continue
#Special Case when there are Ns in the recognition site
elif NInRecogSiteCompare(seq2modify, recogSiteCode) == 1:
subOrigSeq = sequence
countOfNs = recogSiteCode.count('N')
firstNPos = recogSiteCode.find('N')
lastNPos = recogSiteCode.rfind('N')
for i in range(lenRecogSite):
if recogSiteCode[i] == seq2modify[i]:
continue
elif recogSiteCode[i] == 'N':
recogSiteCode = recogSiteCode[:i]+ seq2modify[i] + recogSiteCode[i+1:]
elif recogSiteCode[i] in bases:
baseIndex = np.where(bases==recogSiteCode[i])[0][0]
for j in basesArray[baseIndex]:
if j == seq2modify[i]:
recogSiteCode = recogSiteCode[:i]+ seq2modify[i] + recogSiteCode[i+1:]
break
else:
break
#print(recogSiteCode)
if seq2modify == recogSiteCode:
check = 0
if x%3 == 0:
for a in aminoAcidsArray:
if seq2modify[:3] in a:
for b in a:
if b == seq2modify[:3]:
continue
else:
sequence = sequence[:x] + b + sequence[x+3:]
check = check + 1
if firstNPos == 1:
if sequence[x] == subOrigSeq[x]:
sequence = subOrigSeq
check = 0
else:
Xs.append(x)
elif firstNPos == 2:
if sequence[x:x+2] == subOrigSeq[x:x+2]:
sequence = subOrigSeq
check = 0
else:
Xs.append(x)
else:
Xs.append(x)
break
else:
continue
elif x%3 == 1:
if firstNPos > 2:
for a in aminoAcidsArray:
if seq2modify[2:5] in a:
for b in a:
if b == seq2modify[2:5]:
continue
else:
sequence = sequence[:x+2] + b + sequence[x+5:]
check = check + 1
if firstNPos == 3:
if sequence[x+2] == subOrigSeq[x+2]:
sequence = subOrigSeq
check = 0
else:
Xs.append(x+2)
elif firstNPos == 4:
if sequence[x+2:x+4] == subOrigSeq[x+2:x+4]:
sequence = subOrigSeq
check = 0
else:
Xs.append(x+2)
else:
Xs.append(x+2)
break
else:
continue
elif x%3 == 2:
if firstNPos >1:
for a in aminoAcidsArray:
if seq2modify[1:4] in a:
for b in a:
if b == seq2modify[1:4]:
continue
else:
sequence = sequence[:x+1] + b + sequence[x+4:]
check = check + 1
if firstNPos == 2:
if sequence[x+1] == subOrigSeq[x+1]:
sequence = subOrigSeq
check = 0
else:
Xs.append(x+1)
elif firstNPos == 3:
if sequence[x+1:x+3] == subOrigSeq[x+1:x+3]:
sequence = subOrigSeq
check = 0
else:
Xs.append(x+1)
else:
Xs.append(x+1)
break
else:
continue
#looking at the end of the seq2modify in case looking at start is not fruitful
if check == 0:
#print(check)
LastBaseIdx = x+lenRecogSite-1
if (LastBaseIdx)%3 == 2:
for a in aminoAcidsArray:
if seq2modify[-3:] in a:
for b in a:
if b == seq2modify[-3:]:
continue
else:
sequence = sequence[:LastBaseIdx-2] + b + sequence[LastBaseIdx+1:]
if lenRecogSite -1 - lastNPos == 1:
if sequence[x+lastNPos+1] == subOrigSeq[x+lastNPos+1]:
sequence = subOrigSeq
else:
Xs.append(LastBaseIdx-2)
elif lenRecogSite - 1 - lastNPos == 2:
if sequence[x+lastNPos+1:x+lastNPos+3] == subOrigSeq[x+lastNPos+1:x+lastNPos+3]:
sequence = subOrigSeq
else:
Xs.append(LastBaseIdx-2)
else:
Xs.append(LastBaseIdx-2)
break
else:
continue
elif (LastBaseIdx)%3 == 1:
if lenRecogSite < 5:
continue
else:
for a in aminoAcidsArray:
if seq2modify[-5:-2] in a:
for b in a:
if b == seq2modify[-5:-2]:
continue
else:
sequence = sequence[:LastBaseIdx-4] + b + sequence[LastBaseIdx-1:]
if lenRecogSite - 1 - lastNPos < 3:
sequence = subOrigSeq
elif lenRecogSite - 1 - lastNPos == 3:
if sequence[x+lastNPos+1] == subOrigSeq[x+lastNPos+1]:
sequence = subOrigSeq
else:
Xs.append(LastBaseIdx-4)
elif lenRecogSite - 1 - lastNPos == 4:
if sequence[x+lastNPos+1:x+lastNPos+3] == subOrigSeq[x+lastNPos+1:x+lastNPos+3]:
sequence = subOrigSeq
else:
Xs.append(LastBaseIdx-4)
else:
Xs.append(LastBaseIdx-4)
break
else:
continue
elif (LastBaseIdx)%3 == 0:
if lenRecogSite < 4:
continue
else:
for a in aminoAcidsArray:
if seq2modify[-4:-1] in a:
for b in a:
if b == seq2modify[-4:-1]:
continue
else:
sequence = sequence[:LastBaseIdx-3] + b + sequence[LastBaseIdx:]
if lenRecogSite - 1 - lastNPos < 2:
sequence = subOrigSeq
elif lenRecogSite - 1 - lastNPos == 2:
if sequence[x+lastNPos+1] == subOrigSeq[x+lastNPos+1]:
sequence = subOrigSeq
else:
Xs.append(LastBaseIdx-3)
elif lenRecogSite - 1 - lastNPos == 3:
if sequence[x+lastNPos+1:x+lastNPos+3] == subOrigSeq[x+lastNPos+1:x+lastNPos+3]:
sequence = subOrigSeq
else:
Xs.append(LastBaseIdx-3)
else:
Xs.append(LastBaseIdx-3)
break
else:
continue
if sequence == origSequence:
sequence = "Could not modify sequence"
else:
for i in Xs:
sequence = sequence[:i] + ' *' + sequence[i:i+3] + '* ' + sequence[i+3:]
sequence = reverseStr(sequence, direction)
#print(sequence)
return sequence
#Final
def Final(*args,**kwargs):
sequence = Element('seq').value
direction = Element('dire').value
resEnzIndex = int(Element('enzyme').value)
msg = Element('output')
#msg.write(CheckSequenceLen(sequence))
if CheckSequenceLen(sequence) == "done":
if StartEndCodonCheck(sequence, direction) == "1":
msg.write(recogSiteModification(sequence, resEnzIndex, direction))
else:
msg.write(StartEndCodonCheck(sequence, direction))
else:
msg.write(CheckSequenceLen(sequence))