Package core :: Module InsertMutations
[hide private]
[frames] | no frames]

Source Code for Module core.InsertMutations

  1  #!/usr/bin/python3
 
  2  """
 
  3  Created 2012
 
  4  core Script for the generation of the artificial reference genome
 
  5  
 
  6  The functions purpose is to to go through a list of positions and find balanced mutations
 
  7  which fulfill the demands on the artificial reference genome. Once a initial start positions is
 
  8  randomly selected all possible triplets with hamming distance 1 are generated and looked up 
 
  9  in a dictionary which contains all triplet positions in the input genome. If a suitable partner
 
 10  is found for the initial mutation the next start positions is chosen randomly. Else: try all other
 
 11  triplets with hamming distance 1 until no one is left. This process can be accelerated by allowing
 
 12  unbalanced mutations, but this will cause differences in the NUC/AA distribution and the AA neighborhood.
 
 13  
 
 14  
 
 15  @author: Sven Giese
 
 16  """ 
 17  
 
 18  import random as r 
 19  import Prep as INI 
 20  
 
 21  
 
 22  
 
23 -def getMutation(AA,Codon):
24 """ 25 Returns a random mutation for a given AA and its Codon(DNA). The mutation is done in a way which supports the equilibrium 26 of the nucleotide distribution by only regarding hamming distance=1 Codons as possible mutations 27 28 @type AA: string 29 @param AA: Single AA. 30 @type Codon: string 31 @param Codon: 3-letter dna code. 32 @rtype: list,list 33 @return: A list of all valid mutations (triplet) and the coresponding AA. 34 """ 35 temp_mutationlist = [] 36 '''create a list of possible triplets within hamming distance 1 ''' 37 for item in INI.genetic_code.keys(): 38 isvalid = INI.isvalidtriplet(item,Codon) 39 ''' Hamming distance 1, AA is not equal to the given AA,forbid mutation to stopcodon ''' 40 if (isvalid == True and AA !=INI.genetic_code[item] and INI.genetic_code[item]!="*"): 41 temp_mutationlist.append(item) 42 43 44 aalist = [] 45 # generate a list of all possible amino acids resulting from the temp_mutationlist 46 for item in temp_mutationlist: 47 if (item in INI.genetic_code): 48 aalist.append(INI.genetic_code[item]) 49 else: 50 aalist.append("n") 51 52 return(temp_mutationlist,aalist)
53 54
55 -def getdifference(triplet_old,triplet_new):
56 """ 57 Given two triplets, returns the differences between them plus the position 58 59 @type triplet_old: string 60 @param triplet_old: AA triplet. 61 @type triplet_new: string 62 @param triplet_new: AA triplet. 63 @rtype: Char,Char,int 64 @return: The new aminoacid, the old aminoacid and the position. 65 """ 66 for i in range(0,3): 67 if (triplet_new[i]!=triplet_old[i]): 68 69 return (triplet_new[i],triplet_old[i],i)
70 71
72 -def isvalidposition(pdic,iprime,distance):
73 """ 74 Checks if a position is valid for mutation. It queries all neighboring positions (iprime +-distance) to check whether there already was a mutation in pdic 75 76 @type pdic: dictionary 77 @param pdic: Diciontary containing mutations and start/ stop codons.. 78 @type iprime: int 79 @param iprime: Position of the prospective mutation (DNA level) 80 @type distance: int 81 @param distance: User defined parameter which limits the distance between two mutations. 82 @rtype: Bool 83 @return: Boolean which decides if the position is valid (1= yes,0 = no) 84 """ 85 86 # deal with base shifts 87 distance = distance-2 88 89 istforbidden = 0 90 for o in range(-distance,distance+2,1): 91 if (iprime+o in pdic): 92 # E = end of orf 93 # S = start of orf 94 if((pdic[iprime+o]=="E") or (pdic[iprime+o]=="S")): 95 if((o >3) or (o <-3)): 96 pass 97 else: 98 istforbidden = 1 99 break 100 else: 101 istforbidden = 1 102 break 103 else: 104 pass 105 106 return(istforbidden)
107 108 109 110
111 -def mutate_random(DNA,AminoAcid,distance,pdic,rev,header,Random,outputpath):
112 """ 113 Mutates a given DNA(AminoAcid) Genomesequence on several positions (distance based on DISTANCE var. If one mutation is done 114 a compareable Triplet is searched to "reverse" the changes made in AA distribution, N distribution, AA neighborhood 115 116 @type DNA: list 117 @param DNA: DNA sequence of the reference genome. 118 @type AminoAcid: list 119 @param AminoAcid: AA sequence of the reference genome. 120 @type rev: Bool 121 @param rev: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed) 122 @type pdic: dictionary 123 @param pdic: Diciontary containing mutations and start/ stop codons.. 124 @type header: string 125 @param header: Header for the resulting artificial reference file (fasta format). 126 @type Random: Bool 127 @param Random: Boolean for choosing on of the mutation modes (linear = 0,random = 1) 128 @type distance: int 129 @param distance: User defined parameter which limits the distance between two mutations. 130 @rtype: list 131 @return: Artificial reference genome sequence. 132 """ 133 ##debug vals 134 start = [] # list of start positions of mutations ( start means first mutation in balanced case) 135 both = [] # start and end position 136 fobj2= open(outputpath+header+"_CompleteLog.txt","a") 137 fobj2.write("BalancedMutation"+"\t"+"NewAA" + "\t" + "OldAA"+"\t"+"NewAAPos"+"\t"+"OldAAPos" +"\t"+ "NewDNA"+"\t"+ "OldDNA"+ "\t"+"NewDNAPos"+"\t"+"OldDNAPos"+"\n") 138 fobj2.close() 139 140 141 # generate start positions for mutation (the samplespace) 142 samplespace = [] 143 for i in range (2,len(AminoAcid),distance/3): 144 samplespace.append(i) 145 146 147 ##random_modification 148 if (Random ==1): 149 r.shuffle(samplespace) 150 else: 151 pass 152 153 dna_list = list(DNA) 154 AminoAcid_list = list(AminoAcid) 155 156 '''the lookup dictionary for the aa triplets ''' 157 lookup_dic = INI.createdic(AminoAcid) 158 159 #gotit indicator if a possibility was found to revert the initial changes (start of mutation) 160 gotit=False 161 # stat variables 162 succ_counter = 0 163 fail_counter = 0 164 skip = 0 165 166 ''' Main loop over the AminoAcid''' 167 for i in samplespace: 168 ''' no triplet left --> break ''' 169 if(i+2 >len(AminoAcid)): 170 print("\t(finished...exceeded length of AA)") 171 continue 172 173 ''' AA which is going to be mutated''' 174 AA = AminoAcid_list[i] 175 176 '''index for dna : i*3 --> AminoAcid --> DNA 177 #not i*3+3 because i starts at AA 2 since we need a right and left neighbor''' 178 iprime = i*3 179 180 '''AA and corresponding DNA triplet for the middle AA ''' 181 AA_triplet= AminoAcid_list[i-1]+AminoAcid_list[i]+AminoAcid_list[i+1] 182 DNA_triplet = DNA[iprime:iprime+3] 183 184 # get temporary list of all mutations. Iterate over it to find best possible substitution 185 mutationsliste,aaliste = getMutation(AA,DNA_triplet) 186 187 188 # isvalidposition returns 1 if the position isforbidden, else 0 189 val = isvalidposition(pdic, iprime, distance) 190 if (val ==1): 191 skip+=1 192 fobj2= open(outputpath+header+"_CompleteLog.txt","a") 193 fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(skipped)'"+"\n") 194 fobj2.close() 195 continue 196 197 else: 198 pass 199 200 201 for q,item in enumerate(mutationsliste): 202 203 if gotit==True: 204 break 205 else: 206 pass 207 208 ''' old and new variables for before/after the mutation ''' 209 new_triplet = mutationsliste[q] 210 new_AA = aaliste[q] 211 new_N,old_N,position = getdifference(DNA_triplet,new_triplet) 212 new_AA_triplet = AA_triplet[0]+new_AA+AA_triplet[2] 213 tempdic = pdic 214 tempdic[iprime+position]="M" 215 216 if (new_AA_triplet in lookup_dic): 217 '''templist--> contains all starting positions of the "new_AA_triplet" which we want to substitute back ''' 218 templist = lookup_dic[new_AA_triplet] 219 220 221 # add potential mutation to dictionary 222 tempposition = [iprime+position,"M"] 223 for l in range(0,len(templist)): 224 posi = templist[l] 225 # i*3 --> protein nach DNA, +3 betrachten IMMER mittlere AA 226 ''' suitable dna position found? ''' 227 if (new_triplet == dna_list[posi*3+3]+dna_list[posi*3+3+1]+dna_list[posi*3+3+2]): 228 val = isvalidposition(tempdic, posi*3+3+position, distance) 229 230 if (val ==1): 231 skip+=1 232 continue 233 else: 234 pass 235 236 '''back substitution & do subs on 1st position''' 237 pdic[posi*3+3+position]="R" 238 dna_list[posi*3+3+position]= old_N 239 240 pdic[iprime+position]="M" 241 dna_list[iprime+position]= new_N 242 243 AminoAcid_list[i]= new_AA 244 AminoAcid_list[posi+1]= AA 245 246 gotit = True 247 succ_counter+=1 248 #lookup_dic[new_AA_triplet] = [i for i in lookup_dic[new_AA_triplet] if i!=posi] 249 lookup_dic[new_AA_triplet].remove(posi) 250 251 '''writing the log file ''' 252 fobj= open(outputpath+header+"_CompleteLog.txt","a") 253 fobj.write(str(1)+"\t"+AA_triplet + "\t" + new_AA_triplet+"\t"+str(i)+"\t"+str(posi) +"\t"+ DNA_triplet+"\t"+ str(new_triplet)+ "\t"+str(iprime+position)+"\t"+str(posi*3+3+position)+"\n") 254 fobj.close() 255 256 ## statistics 257 start.append(iprime+position) 258 both.extend([iprime+position,posi*3+3+position]) 259 break 260 261 # no possible triplet positions for back substitution in lookup_dic 262 else: 263 continue 264 265 # after loop 266 if (gotit==False): 267 fobj2= open(outputpath+header+"_CompleteLog.txt","a") 268 fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(tried)'"+"\n") 269 fobj2.close() 270 fail_counter+=1 271 # reverse substitutions on? (=1) off (=0). If one dont change first mutation in the first place. Else: just change it.. 272 if (rev==0): 273 pdic[iprime+position]="M" 274 dna_list[iprime+position]= new_N 275 AminoAcid_list[i]= new_AA 276 start.append(iprime+position) 277 both.extend([iprime+position]) 278 elif (gotit==True): 279 gotit = False 280 281 # stats (INI.savepickle(pdic,header+"_pdic_e")) 282 print("\r\n########Some stats:########") 283 print("DNA length:\t" + str(len(DNA))) 284 print("max substitutions:\t" + str(len(DNA)/distance)) 285 print("#Balanced Mutations:\t" + str(succ_counter)) 286 287 288 return ("".join(dna_list))
289