libStatGen Software 1
ReferenceSequence.cpp
1/*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "assert.h"
19#include "ctype.h"
20#include "stdio.h"
21#include "Error.h"
22
23
24#include "Generic.h"
25#include "ReferenceSequence.h"
26
27#include <algorithm>
28#include <istream>
29#include <fstream>
30#include <sstream>
31#include <stdexcept>
32
33//
34// Given a buffer with a fasta format contents, count
35// the number of chromsomes in it and return that value.
36//
37bool getFastaStats(const char *fastaData, size_t fastaDataSize, uint32_t &chromosomeCount, uint64_t &baseCount)
38{
39 chromosomeCount = 0;
40 baseCount = 0;
41 bool atLineStart = true;
42
43 //
44 // loop over the fasta file, essentially matching for the
45 // pattern '^>.*$' and counting them.
46 //
47 for (size_t fastaIndex = 0; fastaIndex < fastaDataSize; fastaIndex++)
48 {
49 switch (fastaData[fastaIndex])
50 {
51 case '\n':
52 case '\r':
53 atLineStart = true;
54 break;
55 case '>':
56 {
57 if (!atLineStart) break;
58 chromosomeCount++;
59 //
60 // eat the rest of the line
61 //
62 while (fastaIndex < fastaDataSize && fastaData[fastaIndex]!='\n' && fastaData[fastaIndex]!='\r')
63 {
64 fastaIndex++;
65 }
66 break;
67 }
68 default:
69 baseCount++;
70 atLineStart = false;
71 break;
72 }
73
74 }
75 return false;
76}
77
78#if 0
79// turn this into a template on read/quality/etc...
80int GenomeSequence::debugPrintReadValidation(
81 std::string &read,
82 std::string &quality,
83 char direction,
84 genomeIndex_t readLocation,
85 int sumQuality,
86 int mismatchCount,
87 bool recurse
88)
89{
90 int validateSumQ = 0;
91 int validateMismatchCount = 0;
92 int rc = 0;
93 std::string genomeData;
94
95 for (uint32_t i=0; i<read.size(); i++)
96 {
97 if (tolower(read[i]) != tolower((*this)[readLocation + i]))
98 {
99 validateSumQ += quality[i] - '!';
100 // XXX no longer valid:
101 if (direction=='F' ? i<24 : (i >= (read.size() - 24))) validateMismatchCount++;
102 genomeData.push_back(tolower((*this)[readLocation + i]));
103 }
104 else
105 {
106 genomeData.push_back(toupper((*this)[readLocation + i]));
107 }
108 }
109 assert(validateSumQ>=0);
110 if (validateSumQ != sumQuality && validateMismatchCount == mismatchCount)
111 {
112 printf("SUMQ: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d\n",
113 genomeData.c_str(),
114 read.c_str(),
115 validateSumQ,
116 sumQuality
117 );
118 rc++;
119 }
120 else if (validateSumQ == sumQuality && validateMismatchCount != mismatchCount)
121 {
122 printf("MISM: Original Genome: %s test read: %s : actual mismatch %d test mismatches %d\n",
123 genomeData.c_str(),
124 read.c_str(),
125 validateMismatchCount,
126 mismatchCount
127 );
128 rc++;
129 }
130 else if (validateSumQ != sumQuality && validateMismatchCount != mismatchCount)
131 {
132 printf("BOTH: Original Genome: %s test read: %s : actual sumQ = %d, test sumQ = %d, actual mismatch %d test mismatches %d\n",
133 genomeData.c_str(),
134 read.c_str(),
135 validateSumQ,
136 sumQuality,
137 validateMismatchCount,
138 mismatchCount
139 );
140 rc++;
141 }
142
143 if (recurse && abs(validateMismatchCount - mismatchCount) > (int) read.size()/2)
144 {
145 printf("large mismatch difference, trying reverse strand: ");
146 std::string reverseRead = read;
147 std::string reverseQuality = quality;
148 getReverseRead(reverseRead);
149 reverse(reverseQuality.begin(), reverseQuality.end());
150 rc = debugPrintReadValidation(reverseRead, reverseQuality, readLocation, sumQuality, mismatchCount, false);
151 }
152 return rc;
153}
154#endif
155
156
157