libStatGen Software 1
BamIndex.cpp
1/*
2 * Copyright (C) 2010-2012 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include "BamIndex.h"
19#include <iomanip>
20
21BamIndex::BamIndex()
22 : IndexBase(),
23 maxOverallOffset(0),
24 myUnMappedNumReads(-1)
25{
26}
27
28
29BamIndex::~BamIndex()
30{
31}
32
33
34// Reset the member data for a new index file.
36{
38
39 maxOverallOffset = 0;
40 myUnMappedNumReads = -1;
41}
42
43
44// Read & parse the specified index file.
46{
47 // Reset the index from anything that may previously be set.
48 resetIndex();
49
50 IFILE indexFile = ifopen(filename, "rb");
51
52 // Failed to open the index file.
53 if(indexFile == NULL)
54 {
55 return(SamStatus::FAIL_IO);
56 }
57
58 // generate the bam index structure.
59
60 // Read the magic string.
61 char magic[4];
62 if(ifread(indexFile, magic, 4) != 4)
63 {
64 // Failed to read the magic
65 ifclose(indexFile);
66 return(SamStatus::FAIL_IO);
67 }
68
69 // If this is not an index file, set num references to 0.
70 if (magic[0] != 'B' || magic[1] != 'A' || magic[2] != 'I' || magic[3] != 1)
71 {
72 // Not a BAM Index file.
73 ifclose(indexFile);
75 }
76
77 // It is a bam index file.
78 // Read the number of reference sequences.
79 if(ifread(indexFile, &n_ref, 4) != 4)
80 {
81 // Failed to read.
82 ifclose(indexFile);
83 return(SamStatus::FAIL_IO);
84 }
85
86 // Size the references.
87 myRefs.resize(n_ref);
88
89 for(int refIndex = 0; refIndex < n_ref; refIndex++)
90 {
91 // Read each reference.
92 Reference* ref = &(myRefs[refIndex]);
93
94 // Read the number of bins.
95 if(ifread(indexFile, &(ref->n_bin), 4) != 4)
96 {
97 // Failed to read the number of bins.
98 // Return failure.
99 ifclose(indexFile);
100 return(SamStatus::FAIL_PARSE);
101 }
102
103 // If there are no bins, then there are no
104 // mapped/unmapped reads.
105 if(ref->n_bin == 0)
106 {
107 ref->n_mapped = 0;
108 ref->n_unmapped = 0;
109 }
110
111 // Resize the bins so they can be indexed by bin number.
112 ref->bins.resize(ref->n_bin + 1);
113
114 // Read each bin.
115 for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
116 {
117 uint32_t binNumber;
118
119 // Read in the bin number.
120 if(ifread(indexFile, &(binNumber), 4) != 4)
121 {
122 // Failed to read the bin number.
123 // Return failure.
124 ifclose(indexFile);
125 return(SamStatus::FAIL_IO);
126 }
127
128 // Add the bin to the reference and get the
129 // pointer back so the values can be set in it.
130 Bin* binPtr = &(ref->bins[binIndex]);
131 binPtr->bin = binNumber;
132
133 // Read in the number of chunks.
134 if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
135 {
136 // Failed to read number of chunks.
137 // Return failure.
138 ifclose(indexFile);
139 return(SamStatus::FAIL_IO);
140 }
141
142 // Read in the chunks.
143 // Allocate space for the chunks.
144 uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
145 binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
146 if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
147 {
148 // Failed to read the chunks.
149 // Return failure.
150 ifclose(indexFile);
151 return(SamStatus::FAIL_IO);
152 }
153
154 // Determine the min/max for this bin if it is not the max bin.
155 if(binPtr->bin != MAX_NUM_BINS)
156 {
157 for(int i = 0; i < binPtr->n_chunk; i++)
158 {
159 if(binPtr->chunks[i].chunk_beg < ref->minChunkOffset)
160 {
161 ref->minChunkOffset = binPtr->chunks[i].chunk_beg;
162 }
163 if(binPtr->chunks[i].chunk_end > ref->maxChunkOffset)
164 {
165 ref->maxChunkOffset = binPtr->chunks[i].chunk_end;
166 }
167 if(binPtr->chunks[i].chunk_end > maxOverallOffset)
168 {
169 maxOverallOffset = binPtr->chunks[i].chunk_end;
170 }
171 }
172 }
173 else
174 {
175 // Mapped/unmapped are the last chunk of the
176 // MAX BIN
177 ref->n_mapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_beg;
178 ref->n_unmapped = binPtr->chunks[binPtr->n_chunk - 1].chunk_end;
179 }
180 }
181
182 // Read the number of intervals.
183 if(ifread(indexFile, &(ref->n_intv), 4) != 4)
184 {
185 // Failed to read, set to 0.
186 ref->n_intv = 0;
187 // Return failure.
188 ifclose(indexFile);
189 return(SamStatus::FAIL_IO);
190 }
191
192 // Allocate space for the intervals and read them.
193 uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
194 ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
195 if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
196 {
197 // Failed to read the linear index.
198 // Return failure.
199 ifclose(indexFile);
200 return(SamStatus::FAIL_IO);
201 }
202 }
203
204 int32_t numUnmapped = 0;
205 if(ifread(indexFile, &numUnmapped, sizeof(int32_t)) == sizeof(int32_t))
206 {
207 myUnMappedNumReads = numUnmapped;
208 }
209
210 // Successfully read the bam index file.
211 ifclose(indexFile);
212 return(SamStatus::SUCCESS);
213}
214
215
216// Get the chunks for the specified reference id and start/end 0-based
217// coordinates.
218bool BamIndex::getChunksForRegion(int32_t refID, int32_t start, int32_t end,
219 SortedChunkList& chunkList)
220{
221 chunkList.clear();
222
223 // If start is >= to end, there will be no sections, return no
224 // regions.
225 if((start >= end) && (end != -1))
226 {
227 std::cerr << "Warning, requesting region where start <= end, so "
228 << "no values will be returned.\n";
229 return(false);
230 }
231
232 // Handle REF_ID_UNMAPPED. This uses a default chunk which covers
233 // from the max offset to the end of the file.
234 if(refID == REF_ID_UNMAPPED)
235 {
236 Chunk refChunk;
237 // The start of the unmapped region is the max offset found
238 // in the index file.
239 refChunk.chunk_beg = getMaxOffset();
240 // The end of the unmapped region is the end of the file, so
241 // set chunk end to the max value.
242 refChunk.chunk_end = Chunk::MAX_CHUNK_VALUE;
243 return(chunkList.insert(refChunk));
244 }
245
246 if((refID < 0) || (refID >= n_ref))
247 {
248 // The specified refID is out of range, return false.
249 std::cerr << "Warning, requesting refID is out of range, so "
250 << "no values will be returned.\n";
251 return(false);
252 }
253
254 const Reference* ref = &(myRefs[refID]);
255
256 // Handle where start/end are defaults.
257 if(start == -1)
258 {
259 if(end == -1)
260 {
261 // This is whole chromosome, so take a shortcut.
262 if(ref->maxChunkOffset == 0)
263 {
264 // No chunks for this region, but this is not an error.
265 return(true);
266 }
267 Chunk refChunk;
268 refChunk.chunk_beg = ref->minChunkOffset;
269 refChunk.chunk_end = ref->maxChunkOffset;
270 return(chunkList.insert(refChunk));
271 }
272 else
273 {
274 start = 0;
275 }
276 }
277 if(end == -1)
278 {
279 // MAX_POSITION is inclusive, but end is exclusive, so add 1.
280 end = MAX_POSITION + 1;
281 }
282
283 // Determine the minimum offset for the given start position. This
284 // is done by using the linear index for the specified start position.
285 uint64_t minOffset = 0;
286 getMinOffsetFromLinearIndex(refID, start, minOffset);
287
288 bool binInRangeMap[MAX_NUM_BINS+1];
289
290 getBinsForRegion(start, end, binInRangeMap);
291
292 // Loop through the bins in the ref and if they are in the region, get the chunks.
293 for(int i = 0; i < ref->n_bin; ++i)
294 {
295 const Bin* bin = &(ref->bins[i]);
296 if(binInRangeMap[bin->bin] == false)
297 {
298 // This bin is not in the region, so check the next one.
299 continue;
300 }
301
302 // Add each chunk in the bin to the map.
303 for(int chunkIndex = 0; chunkIndex < bin->n_chunk; chunkIndex++)
304 {
305 // If the end of the chunk is less than the minimum offset
306 // for the 16K block that starts our region, then no
307 // records in this chunk will cross our region, so do
308 // not add it to the chunks we need to use.
309 if(bin->chunks[chunkIndex].chunk_end < minOffset)
310 {
311 continue;
312 }
313 // Add the chunk to the map.
314 if(!chunkList.insert(bin->chunks[chunkIndex]))
315 {
316 // Failed to add to the map, return false.
317 std::cerr << "Warning, Failed to add a chunk, so "
318 << "no values will be returned.\n";
319 return(false);
320 }
321 }
322 }
323
324 // Now that all chunks have been added to the list,
325 // handle overlapping chunks.
326 return(chunkList.mergeOverlapping());
327}
328
329
330// Get the max offset.
331uint64_t BamIndex::getMaxOffset() const
332{
333 return(maxOverallOffset);
334}
335
336// Get the min & max file offsets for the reference ID.
338 uint64_t& minOffset,
339 uint64_t& maxOffset) const
340{
341 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
342 {
343 // Reference ID is out of range for this index file.
344 return(false);
345 }
346
347 // Get this reference.
348 minOffset = myRefs[refID].minChunkOffset;
349 maxOffset = myRefs[refID].maxChunkOffset;
350 return(true);
351}
352
353
354// Get the number of mapped reads for this reference id.
355int32_t BamIndex::getNumMappedReads(int32_t refID)
356{
357 // If it is the reference id of unmapped reads, return
358 // that there are no mapped reads.
359 if(refID == REF_ID_UNMAPPED)
360 {
361 // These are by definition all unmapped reads.
362 return(0);
363 }
364
365 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
366 {
367 // Reference ID is out of range for this index file.
368 return(-1);
369 }
370
371 // Get this reference.
372 return(myRefs[refID].n_mapped);
373}
374
375
376// Get the number of unmapped reads for this reference id.
377int32_t BamIndex::getNumUnMappedReads(int32_t refID)
378{
379 // If it is the reference id of unmapped reads, return
380 // that value.
381 if(refID == REF_ID_UNMAPPED)
382 {
383 return(myUnMappedNumReads);
384 }
385
386 if((refID < 0) || (refID >= (int32_t)myRefs.size()))
387 {
388 // Reference ID is out of range for this index file.
389 return(-1);
390 }
391
392 // Get this reference.
393 return(myRefs[refID].n_unmapped);
394}
395
396
397// Print the bam index.
398void BamIndex::printIndex(int32_t refID, bool summary)
399{
400 std::cout << "BAM Index: " << std::endl;
401 std::cout << "# Reference Sequences: " << n_ref << std::endl;
402
403 unsigned int startRef = 0;
404 unsigned int endRef = myRefs.size() - 1;
405 std::vector<Reference> refsToProcess;
406 if(refID != -1)
407 {
408 // Set start and end ref to the specified reference id.
409 startRef = refID;
410 endRef = refID;
411 }
412
413 // Print out the information for each bin.
414 for(unsigned int i = startRef; i <= endRef; ++i)
415 {
416 std::cout << std::dec
417 << "\tReference ID: " << std::setw(4) << i
418 << "; #Bins: "<< std::setw(6) << myRefs[i].n_bin
419 << "; #Linear Index Entries: "
420 << std::setw(6) << myRefs[i].n_intv
421 << "; Min Chunk Offset: "
422 << std::setw(18) << std::hex << std::showbase << myRefs[i].minChunkOffset
423 << "; Max Chunk Offset: "
424 << std::setw(18) << myRefs[i].maxChunkOffset
425 << std::dec;
426 // Print the mapped/unmapped if set.
427 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
428 {
429 std::cout << "; " << myRefs[i].n_mapped << " Mapped Reads";
430 }
431 if(myRefs[i].n_mapped != Reference::UNKNOWN_MAP_INFO)
432 {
433 std::cout << "; " << myRefs[i].n_unmapped << " Unmapped Reads";
434 }
435 std::cout << std::endl;
436
437 // Only print more details if not summary.
438 if(!summary)
439 {
440 std::vector<Bin>::iterator binIter;
441 for(binIter = myRefs[i].bins.begin();
442 binIter != myRefs[i].bins.end();
443 ++binIter)
444 {
445 Bin* binPtr = &(*binIter);
446 if(binPtr->bin == Bin::NOT_USED_BIN)
447 {
448 // This bin is not used, continue.
449 continue;
450 }
451 // Print the bin info.
452 std::cout << "\t\t\tBin Name: " << binPtr->bin << std::endl;
453 std::cout << "\t\t\t# Chunks: " << binPtr->n_chunk << std::endl;
454 std::cout << std::hex << std::showbase;
455
456 for(int chunkIndex = 0; chunkIndex < binPtr->n_chunk;
457 ++chunkIndex)
458 {
459 // If this is the last chunk of the MAX_NUM_BINS - it
460 // contains a mapped/unmapped count rather than the regular
461 // chunk addresses.
462 if((binPtr->bin != MAX_NUM_BINS) ||
463 (chunkIndex != (binPtr->n_chunk - 1)))
464 {
465 std::cout << "\t\t\t\tchunk_beg: "
466 << binPtr->chunks[chunkIndex].chunk_beg
467 << std::endl;
468 std::cout << "\t\t\t\tchunk_end: "
469 << binPtr->chunks[chunkIndex].chunk_end
470 << std::endl;
471 }
472 }
473 }
474 std::cout << std::dec;
475
476 // Print the linear index.
477 for(int linearIndex = 0; linearIndex < myRefs[i].n_intv;
478 ++linearIndex)
479 {
480 if(myRefs[i].ioffsets[linearIndex] != 0)
481 {
482 std::cout << "\t\t\tLinearIndex["
483 << std::dec << linearIndex << "] Offset: "
484 << std::hex << myRefs[i].ioffsets[linearIndex]
485 << std::endl;
486 }
487 }
488 }
489 }
490}
IFILE ifopen(const char *filename, const char *mode, InputFile::ifileCompression compressionMode=InputFile::DEFAULT)
Open a file with the specified name and mode, using a filename of "-" to indicate stdin/stdout.
Definition: InputFile.h:562
int ifclose(IFILE &file)
Close the file.
Definition: InputFile.h:580
unsigned int ifread(IFILE file, void *buffer, unsigned int size)
Read up to size bytes from the file into the buffer.
Definition: InputFile.h:600
virtual void resetIndex()
Reset the member data for a new index file.
Definition: BamIndex.cpp:35
bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
Get the list of chunks associated with this region.
Definition: BamIndex.cpp:218
int32_t getNumMappedReads(int32_t refID)
Get the number of mapped reads for this reference id.
Definition: BamIndex.cpp:355
SamStatus::Status readIndex(const char *filename)
Definition: BamIndex.cpp:45
bool getReferenceMinMax(int32_t refID, uint64_t &minOffset, uint64_t &maxOffset) const
Get the minimum and maximum file offsets for the specfied reference ID.
Definition: BamIndex.cpp:337
int32_t getNumUnMappedReads(int32_t refID)
Get the number of unmapped reads for this reference id.
Definition: BamIndex.cpp:377
static const int32_t REF_ID_UNMAPPED
The number used for the reference id of unmapped reads.
Definition: BamIndex.h:86
void printIndex(int32_t refID, bool summary=false)
Print the index information.
Definition: BamIndex.cpp:398
virtual void resetIndex()
Reset the member data for a new index file.
Definition: IndexBase.cpp:130
Class for easily reading/writing files without having to worry about file type (uncompressed,...
Definition: InputFile.h:37
Status
Return value enum for StatGenFile methods.
Definition: StatGenStatus.h:32
@ SUCCESS
method completed successfully.
Definition: StatGenStatus.h:32
@ FAIL_IO
method failed due to an I/O issue.
Definition: StatGenStatus.h:37
@ FAIL_PARSE
failed to parse a record/header - invalid format.
Definition: StatGenStatus.h:42