1
2 '''
3 Created 2012
4
5 Contains various helper functions that read input or produce output.
6
7
8 @author: Sven Giese
9 '''
10 import os
11 import random
12 import HTSeq
13
14
def readdna(filename):
    """
    Reads the DNA sequence of the given FASTA file.

    Every record of the file is visited; the sequence built from the last
    record is the one that is returned.

    @type  filename: string
    @param filename: Fasta-file used as input.
    @rtype:   HTSeq Sequence object
    @return:  Reference Fasta.
    @raise ValueError: If the file does not contain any FASTA record.
    """
    # NOTE(review): the "def" line and all indentation are missing from the
    # reviewed listing. The function name, and the placement of the return
    # statement after the loop, are reconstructed -- confirm against callers.
    reference = None
    # Named "reader" (not "chr") so the builtin chr() is not shadowed.
    reader = HTSeq.FastaReader(filename)
    for record in reader:
        reference = HTSeq.Sequence(record.seq, record.name)
    if reference is None:
        # An empty file used to surface as an UnboundLocalError on return.
        raise ValueError("no FASTA record found in " + str(filename))
    return reference
28
29
def writefile(sequenceObject, filename):
    """
    Writes a given sequence object to a fasta file.

    @type  sequenceObject: HTSeq Sequence object
    @param sequenceObject: Reference sequence as fasta. Only its
        write_to_fasta_file(handle) method is used.
    @type  filename: string
    @param filename: Output file; it is opened in "w" mode, so an existing
        file is overwritten.
    """
    # NOTE(review): the "def" line is missing from the reviewed listing; the
    # name and parameter order are reconstructed -- confirm against callers.
    # "with" closes the handle even when write_to_fasta_file() raises.
    with open(filename, "w") as outfasta:
        sequenceObject.write_to_fasta_file(outfasta)
41
42
def writeoverview(Ndic_G, aadic_G, Ndic_AR, aadic_AR, filename):
    """
    Creates the "delta" file for the comparison of the two chromosomes.

    The file starts with a header line. For every key of the genome
    nucleotide dictionary one tab-separated row is written (key, genome
    value, artificial value, genome minus artificial), followed by a line
    holding the sum of the absolute differences. The same is then repeated
    for the amino acid dictionaries.

    @type  Ndic_G: dictionary
    @param Ndic_G: Nucleotide dictionary genome.
    @type  aadic_G: dictionary
    @param aadic_G: AA dictionary genome.
    @type  Ndic_AR: dictionary
    @param Ndic_AR: Nucleotide dictionary artificial.
    @type  aadic_AR: dictionary
    @param aadic_AR: AA dictionary artificial.
    @type  filename: string
    @param filename: Output filename (opened in "w" mode).
    @raise KeyError: If a key of a genome dictionary is missing from the
        corresponding artificial dictionary.
    """
    # NOTE(review): the "def" line is missing from the reviewed listing; the
    # name is reconstructed, the parameter order follows the docstring --
    # confirm against callers.
    # "with" guarantees the file is flushed and closed; the handle was
    # previously never closed.
    with open(filename, "w") as fobj:
        fobj.write("NUC /AA \t Genom \t Artificial Reference \t Delta \n")

        # Same table layout for nucleotides first, amino acids second.
        for genome_dic, artificial_dic in ((Ndic_G, Ndic_AR), (aadic_G, aadic_AR)):
            abs_delta_sum = 0
            for item in genome_dic:
                delta = genome_dic[item] - artificial_dic[item]
                fobj.write(item + "\t" + str(genome_dic[item]) + "\t"
                           + str(artificial_dic[item]) + "\t" + str(delta) + "\n")
                abs_delta_sum += abs(delta)
            fobj.write(str(abs_delta_sum) + "\n")
73
74
75
76
def nucleotide_dist_seq(seq, txt_file=None, shallwrite=0):
    """
    Counts the nucleotides of a sequence, optionally writes the distribution
    to a file and returns it as a dictionary.

    The counts are divided by the divisor "s", which is fixed to 1, so the
    returned values are absolute counts stored as floats. Adjust "s" (e.g. to
    the sequence length) for fractional results.

    @type  seq: string
    @param seq: Nucleotide sequence.
    @type  txt_file: string
    @param txt_file: Output compare file; only used when shallwrite is 1.
    @type  shallwrite: Bool
    @param shallwrite: If 1 (or True), one "symbol<TAB>value" line per
        nucleotide is written to txt_file.
    @rtype:   dictionary
    @return:  Mapping of "A", "C", "G", "T" and "N" to their (float) value.
    @raise KeyError: If seq contains a symbol other than A, C, G, T or N.
    """
    # NOTE(review): the "def" line is missing from the reviewed listing. The
    # name is taken from the call site in this file and the parameter order
    # from the docstring; the defaults are new and keep existing three-
    # argument calls working while also allowing calls with only the sequence.
    Nndic = {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0}
    for nucleotide in seq:
        Nndic[nucleotide] += 1

    # Divisor for every count; 1 keeps absolute numbers.
    s = 1
    for item in Nndic:
        Nndic[item] = Nndic[item] / float(s)

    if shallwrite == 1:
        # "with" closes the file even if a write fails.
        with open(txt_file, 'w') as output_file:
            for item in Nndic:
                output_file.write(item + "\t" + str(Nndic[item]) + "\n")

    return Nndic
105
106
107
def aa_dist_seq(seq, txt_file=None, shallwrite=0):
    """
    Counts the amino acids of a sequence, optionally writes the distribution
    to a file and returns it as a dictionary.

    Symbols that are not one of the 20 one-letter amino acid codes or "*"
    are skipped. The counts are divided by the divisor "n", which is fixed to
    1, so the returned values are absolute counts stored as floats. Adjust
    "n" (e.g. to the sequence length) for fractional results.

    @type  seq: string
    @param seq: Amino acid sequence (one-letter codes).
    @type  txt_file: string
    @param txt_file: Output compare file; only used when shallwrite is 1.
    @type  shallwrite: Bool
    @param shallwrite: If 1 (or True), one "symbol<TAB>value" line per amino
        acid is written to txt_file.
    @rtype:   dictionary
    @return:  Mapping of every amino acid code (and "*") to its (float) value.
    """
    # NOTE(review): the "def" line is missing from the reviewed listing; the
    # name is reconstructed and the parameter order follows the docstring --
    # confirm against callers. The defaults are new and backward-compatible.
    aadic = {"A": 0, "R": 0, "N": 0, "D": 0, "C": 0, "E": 0, "Q": 0,
             "G": 0, "H": 0, "I": 0, "L": 0, "K": 0, "M": 0, "F": 0,
             "P": 0, "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0, "*": 0}
    for residue in seq:
        # Unknown symbols (e.g. from 'n' stretches) are ignored.
        if residue in aadic:
            aadic[residue] += 1

    # Divisor for every count; 1 keeps absolute numbers.
    n = 1
    for item in aadic:
        aadic[item] = aadic[item] / float(n)

    if shallwrite == 1:
        # "with" closes the file even if a write fails.
        with open(txt_file, 'w') as output_file:
            for item in aadic:
                output_file.write(item + "\t" + str(aadic[item]) + "\n")

    return aadic
142
143 '''
144 input: DNA sequence, output filename and 1/0 for writing/not writing the output file '''
145
def nucleotide_dist_file(file_fasta, txt_file):
    """
    Computes the nucleotide distribution of a FASTA file and appends it to a
    text file.

    All lines that do not start with ">" are stripped of trailing whitespace
    and joined to one sequence. The dictionary returned by
    nucleotide_dist_seq for that sequence is appended to txt_file as a single
    line (its str() representation).

    @type  file_fasta: string
    @param file_fasta: Path of the FASTA file holding the DNA sequence.
    @type  txt_file: string
    @param txt_file: Filename for output (opened in append mode).
    """
    # NOTE(review): the "def" line is missing from the reviewed listing; the
    # name is reconstructed and the parameter order follows the docstring --
    # confirm against callers.
    with open(file_fasta, 'r') as input_file:
        # join() instead of repeated "+=" avoids quadratic string building.
        seq = "".join(line.rstrip() for line in input_file if line[0] != '>')

    # nucleotide_dist_seq documents three parameters (seq, txt_file,
    # shallwrite); pass all of them, with shallwrite=0 so the helper itself
    # writes nothing. The former call passed only the sequence.
    distribution = nucleotide_dist_seq(seq, txt_file, 0)

    with open(txt_file, 'a') as output_file:
        output_file.write(str(distribution))
        output_file.write('\n')
166
167
168 '''Gets the number of mismatches between two sequences.
169 input: original sequence, decoy sequence '''
def gethammingdistance(original, artificial):
    """
    Calculates the hamming distance between two sequences and prints it.

    Positions 0 .. len(original)-1 are compared. Three lines are printed:
    the number of mismatching positions, the average distance
    (len(original) divided by the number of mismatches) and a separator.
    Nothing is returned.

    @type  original: list
    @param original: Nucleotide sequence from the reference.
    @type  artificial: list
    @param artificial: Nucleotide sequence from the artificial reference.
    @raise IndexError: If artificial is shorter than original.
    """
    # NOTE(review): the "def" line is missing from the reviewed listing; the
    # name is reconstructed and the parameter order follows the docstring --
    # confirm against callers.
    hamming = 0
    for position, symbol in enumerate(original):
        if symbol != artificial[position]:
            hamming += 1

    if hamming:
        avg_distance = len(original) / float(hamming)
    else:
        # No mismatch at all: the division used to raise ZeroDivisionError.
        avg_distance = float("inf")

    print("#hamming distance REF-ART\t" + str(hamming))
    print("avg. distance:\t" + str(avg_distance))
    print("###########################\r\n")
189