libStatGen Software 1
Cigar.h
1/*
2 * Copyright (C) 2010-2011 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#if !defined(_CIGAR_H)
19#define _CIGAR_H
20
21#include <string.h> // for inline use of strcat, etc
22#include <limits.h> // for INT_MAX
23#include <stdint.h> // for uint32_t and friends
24
25
26#include <assert.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string>
30#include <iostream>
31#include <iomanip>
32#include <utility>
33#include <vector>
34
35#include "Generic.h"
36#include "StringBasics.h"
37
38/// This class represents the CIGAR without any methods to set the cigar
39/// (see CigarRoller for that).
40
41//
42// Docs from Sam1.pdf:
43//
44// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
45// Subsequences at the ends may be clipped off. We introduce operation ʻSʼ to describe (softly) clipped alignment. Here is
46// an example. Suppose the clipped alignment is:
47// REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
48// READ: gggGTGTAACC-GACTAGgggg
49// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
50// this alignment is: 3S8M1D6M4S.
51//
52//
53// If the mapping position of the query is not available, RNAME and
54// CIGAR are set as “*”
55//
56// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
57// for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
58// further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
59//
60// op Description
61// -- -----------
62// M Match or mismatch
63// I Insertion to the reference
64// D Deletion from the reference
65// N Skipped region from the reference
66// S Soft clip on the read (clipped sequence present in <seq>)
67// H Hard clip on the read (clipped sequence NOT present in <seq>)
68// P Padding (silent deletion from the padded reference sequence)
69//
70
71
72
73
74////////////////////////////////////////////////////////////////////////
75///
76/// This class represents the CIGAR. It contains methods for converting
77/// to strings and extracting information from the cigar on how a read
78/// maps to the reference.
79///
80/// It only contains read only methods. There are no ways to set
81/// values. To set a value, a child class must be used.
82///
83class Cigar
84{
85public:
86 /// Enum for the cigar operations.
87 enum Operation {
88 none=0, ///< no operation has been set.
89 match, ///< match/mismatch operation. Associated with CIGAR Operation "M"
90 mismatch, ///< mismatch operation. Associated with CIGAR Operation "M"
91 insert, ///< insertion to the reference (the query sequence contains bases that have no corresponding base in the reference). Associated with CIGAR Operation "I"
92 del, ///< deletion from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "D"
93 skip, ///< skipped region from the reference (the reference contains bases that have no corresponding base in the query sequence). Associated with CIGAR Operation "N"
94 softClip, ///< Soft clip on the read (clipped sequence present in the query sequence, but not in reference). Associated with CIGAR Operation "S"
95 hardClip, ///< Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H"
96 pad ///< Padding (not in reference or query). Associated with CIGAR Operation "P"
97 };
98
99 // The maximum value in the operation enum (used for setting up a bitset of
100 // operations.
101 static const int MAX_OP_VALUE = pad;
102
103 ////////////////////////////////////////////////////////////////////////
104 //
105 // Nested Struct : CigarOperator
106 //
108 {
109
111 {
112 operation = none;
113 count = 0;
114 }
115
116 /// Set the cigar operator with the specified operation and
117 /// count length.
118 CigarOperator(Operation operation, uint32_t count)
119 : operation(operation), count(count) {};
120
121 Operation operation;
122
123 uint32_t count;
124
125 /// Get the character code (M, I, D, N, S, H, or P) associated with
126 /// this operation.
127 char getChar() const
128 {
129 switch (operation)
130 {
131 case none:
132 return '?'; // error
133 case match:
134 case mismatch:
135 return'M';
136 case insert:
137 return 'I';
138 case del:
139 return'D';
140 case skip:
141 return 'N';
142 case softClip:
143 return 'S';
144 case hardClip:
145 return 'H';
146 case pad:
147 return 'P';
148 }
149 return '?'; // actually it is an error to get here
150 }
151
152 /// Compare only on the operator, true if they are the same, false if not. Match and mismatch are considered the same for CIGAR strings.
153 bool operator == (const CigarOperator &rhs) const
154 {
155 if (operation==rhs.operation)
156 return true;
157 if ((operation == mismatch || operation == match) && (rhs.operation == mismatch || rhs.operation == match))
158 return true;
159 return false;
160 }
161
162 /// Compare only on the operator, false if they are the same, true if not. Match and mismatch are considered the same for CIGAR strings.
163 bool operator != (const CigarOperator &rhs) const
164 {
165 return !((*this) == rhs) ;
166 }
167
168 };
169
170 ////////////////////////////////////////////////////////////////////////
171 //
172 // Cigar Class statics
173 //
174
175 /// Return true if the specified operation is found in the
176 /// reference sequence, false if not.
178 {
179 switch(op)
180 {
181 case match:
182 case mismatch:
183 case del:
184 case skip:
185 return true;
186 default:
187 return false;
188 }
189 return false;
190 }
191
192 /// Return true if the specified operation is found in the
193 /// reference sequence, false if not.
194 static bool foundInReference(char op)
195 {
196 switch(op)
197 {
198 case 'M':
199 case '=':
200 case 'X':
201 case 'D':
202 case 'N':
203 return true;
204 default:
205 return false;
206 }
207 return false;
208 }
209
210 /// Return true if the specified operation is found in the
211 /// reference sequence, false if not.
212 static bool foundInReference(const CigarOperator &op)
213 {
214 return(foundInReference(op.operation));
215 }
216
217 /// Return true if the specified operation is found in the
218 /// query sequence, false if not.
219 static bool foundInQuery(Operation op)
220 {
221 switch(op)
222 {
223 case match:
224 case mismatch:
225 case insert:
226 case softClip:
227 return true;
228 default:
229 return false;
230 }
231 return false;
232 }
233
234 /// Return true if the specified operation is found in the
235 /// query sequence, false if not.
236 static bool foundInQuery(char op)
237 {
238 switch(op)
239 {
240 case 'M':
241 case '=':
242 case 'X':
243 case 'I':
244 case 'S':
245 return true;
246 default:
247 return false;
248 }
249 return false;
250 }
251
252 /// Return true if the specified operation is found in the
253 /// query sequence, false if not.
254 static bool foundInQuery(const CigarOperator &op)
255 {
256 return(foundInQuery(op.operation));
257 }
258
259 /// Return true if the specified operation is a clipping operation,
260 /// false if not.
261 static bool isClip(Operation op)
262 {
263 switch(op)
264 {
265 case softClip:
266 case hardClip:
267 return true;
268 default:
269 return false;
270 }
271 return false;
272 }
273
274 /// Return true if the specified operation is a clipping operation,
275 /// false if not.
276 static bool isClip(char op)
277 {
278 switch(op)
279 {
280 case 'S':
281 case 'H':
282 return true;
283 default:
284 return false;
285 }
286 return false;
287 }
288
289 /// Return true if the specified operation is a clipping operation,
290 /// false if not.
291 static bool isClip(const CigarOperator &op)
292 {
293 return(isClip(op.operation));
294 }
295
296 /// Return true if the specified operation is a match/mismatch operation,
297 /// false if not.
299 {
300 switch(op)
301 {
302 case match:
303 case mismatch:
304 return true;
305 default:
306 return false;
307 }
308 return false;
309 }
310
311 /// Return true if the specified operation is a match/mismatch operation,
312 /// false if not.
313 static bool isMatchOrMismatch(const CigarOperator &op)
314 {
315 return(isMatchOrMismatch(op.operation));
316 }
317
318
319 ////////////////////////////////////////////////////////////////////////
320 //
321 // Cigar Class non static
322 //
323 friend std::ostream &operator << (std::ostream &stream, const Cigar& cigar);
324
325 /// Default constructor initializes as a CIGAR with no operations.
327 {
328 clearQueryAndReferenceIndexes();
329 }
330
331
332 /// Set the passed in String to the string reprentation of the Cigar
333 /// operations in this object.
334 void getCigarString(String& cigarString) const;
335
336 /// Set the passed in std::string to the string reprentation of the Cigar
337 /// operations in this object.
338 void getCigarString(std::string& cigarString) const;
339
340 /// Sets the specified string to a valid CIGAR string of characters that
341 /// represent the cigar with no digits (a CIGAR of "3M" would return "MMM").
342 /// The returned string is actually also a valid CIGAR string.
343 /// In theory this makes it easier to parse some reads.
344 /// \return s the string to populate
345 void getExpandedString(std::string &s) const;
346
347 /// Return the Cigar Operation at the specified index (starting at 0).
348 const CigarOperator & operator [](int i) const
349 {
350 return cigarOperations[i];
351 }
352
353 /// Return the Cigar Operation at the specified index (starting at 0).
354 const CigarOperator & getOperator(int i) const
355 {
356 return cigarOperations[i];
357 }
358
359 /// Return true if the 2 Cigars are the same
360 /// (the same operations of the same sizes).
361 bool operator == (Cigar &rhs) const;
362
363 /// Return the number of cigar operations
364 int size() const
365 {
366 return cigarOperations.size();
367 }
368
369 /// Write this object as a string to cout.
370 void Dump() const
371 {
372 String cigarString;
373 getCigarString(cigarString);
374 std::cout << cigarString ;
375 }
376
377 /// Return the length of the read that corresponds to
378 /// the current CIGAR string.
379 ///
380 /// For validation, we should expect that a sequence
381 /// read in a SAM file will be the same length as the
382 /// value returned by this method.
383 ///
384 /// Example: 3M2D3M describes a read with three bases
385 /// matching the reference, then skips 2 bases, then has
386 /// three more bases that match the reference (match/mismatch).
387 /// In this case, the read length is expected to be 6.
388 ///
389 /// Example: 3M2I3M describes a read with 3 match/mismatch
390 /// bases, two extra bases, and then 3 more match/mistmatch
391 /// bases. The total in this example is 8 bases.
392 ///
393 /// \return returns the expected read length
394 int getExpectedQueryBaseCount() const;
395
396 /// Return the number of bases in the reference that
397 /// this CIGAR "spans".
398 ///
399 /// When doing range checking, we occassionally need to know
400 /// how many total bases the CIGAR string represents as compared
401 /// to the reference.
402 ///
403 /// Examples: 3M2D3M describes a read that overlays 8 bases in
404 /// the reference. 3M2I3M describes a read with 3 bases that
405 /// match the reference, two additional bases that aren't in the
406 /// reference, and 3 more bases that match the reference, so it
407 /// spans 6 bases in the reference.
408 ///
409 /// \return how many bases in the reference are spanned
410 /// by the given CIGAR string
411 ///
413
414 /// Return the number of clips that are at the beginning of the cigar.
415 int getNumBeginClips() const;
416
417 /// Return the number of clips that are at the end of the cigar.
418 int getNumEndClips() const;
419
420 /// Return the reference offset associated with the specified
421 /// query index or INDEX_NA based on this cigar.
422 int32_t getRefOffset(int32_t queryIndex);
423
424 /// Return the query index associated with the specified
425 /// reference offset or INDEX_NA based on this cigar.
426 int32_t getQueryIndex(int32_t refOffset);
427
428 /// Return the reference position associated with the specified query index
429 /// or INDEX_NA based on this cigar and the specified queryStartPos which
430 /// is the leftmost mapping position of the first matching base in the
431 /// query.
432 int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos);
433
434 /// Return the query index or INDEX_NA associated with the specified
435 /// reference offset when the query starts at the specified reference
436 /// position.
437 int32_t getQueryIndex(int32_t refPosition, int32_t queryStartPos);
438
439 /// Returns the index into the expanded cigar for the cigar
440 /// associated with the specified queryIndex.
441 /// INDEX_NA returned if the index is out of range.
442 int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex);
443
444 /// Returns the index into the expanded cigar for the cigar
445 /// associated with the specified reference offset.
446 /// INDEX_NA returned if the offset is out of range.
447 int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset);
448
449 /// Returns the index into the expanded cigar for the cigar
450 /// associated with the specified reference position and queryStartPos.
451 /// INDEX_NA returned if the position is out of range.
452 int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition,
453 int32_t queryStartPos);
454
455 /// Return the character code of the cigar operator associated with the
456 /// specified expanded CIGAR index. '?' is returned for an out of range
457 /// index.
458 char getCigarCharOp(int32_t expandedCigarIndex);
459
460 /// Return the character code of the cigar operator associated with
461 /// the specified queryIndex. '?' is returned for an out of range index.
462 char getCigarCharOpFromQueryIndex(int32_t queryIndex);
463
464 /// Return the character code of the cigar operator associated with
465 /// the specified reference offset. '?' is returned for an out of range offset.
466 char getCigarCharOpFromRefOffset(int32_t refOffset);
467
468 /// Return the character code of the cigar operator associated with
469 /// the specified reference position. '?' is returned for an out of
470 /// range reference position.
471 char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos);
472
473 /// Return the number of bases that overlap the reference and the
474 /// read associated with this cigar that falls within the specified region.
475 /// \param start : inclusive 0-based start position (reference position) of
476 /// the region to check for overlaps in
477 /// (-1 indicates to start at the beginning of the reference.)
478 /// \param end : exclusive 0-based end position (reference position) of the
479 /// region to check for overlaps in
480 /// (-1 indicates to go to the end of the reference.)
481 /// \param queryStartPos : 0-based leftmost mapping position of the first
482 /// matcihng base in the query.
483 uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos);
484
485 /// Return whether or not the cigar has indels (insertions or delections)
486 /// \return true if it has an insertion or deletion, false if not.
487 bool hasIndel();
488
489 /// Value associated with an index that is not applicable/does not exist,
490 /// used for converting between query and reference indexes/offsets when
491 /// an associated index/offset does not exist.
492 static const int32_t INDEX_NA;
493
494protected:
495 // Clear the query index/reference offset index vectors.
496 void clearQueryAndReferenceIndexes();
497
498 // Set the query index/reference offset index vectors.
499 void setQueryAndReferenceIndexes();
500
501 // Container for the cigar operations in this cigar.
502 std::vector<CigarOperator> cigarOperations;
503
504private:
505 // The vector is indexed by query index and contains the reference
506 // offset associated with that query index.
507 // The vector is reset each time a new cigar operation is added, and
508 // is calculated when accessed if it is not already set.
509 std::vector<int32_t> queryToRef;
510
511 // The vector is indexed by reference offset and contains the query
512 // index associated with that reference offset.
513 // The vector is reset each time a new cigar operation is added, and
514 // is calculated when accessed if it is not already set.
515 std::vector<int32_t> refToQuery;
516
517 // The vector is indexed by reference offset and contains the offset into
518 // the expanded cigar associated with that reference offset.
519 // The vector is reset each time a new cigar operation is added, and
520 // is calculated when accessed if it is not already set.
521 std::vector<int32_t> refToCigar;
522
523 // The vector is indexed by query index and contains the offset into
524 // the expanded cigar associated with that query index.
525 // The vector is reset each time a new cigar operation is added, and
526 // is calculated when accessed if it is not already set.
527 std::vector<int32_t> queryToCigar;
528
529 std::string myExpandedCigar;
530};
531
532/// Writes the specified cigar operation to the specified stream as <count><char> (3M).
533inline std::ostream &operator << (std::ostream &stream, const Cigar::CigarOperator& o)
534{
535 stream << o.count << o.getChar();
536 return stream;
537}
538
539/// Writes all of the cigar operations contained in the cigar to the passed in stream.
540inline std::ostream &operator << (std::ostream &stream, const Cigar& cigar)
541{
542 stream << cigar.cigarOperations;
543 return stream;
544}
545
546#endif
InputFile & operator<<(InputFile &stream, const std::string &str)
Write to a file using streaming.
Definition: InputFile.h:736
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
char getCigarCharOp(int32_t expandedCigarIndex)
Return the character code of the cigar operator associated with the specified expanded CIGAR index.
Definition: Cigar.cpp:295
static bool foundInReference(Operation op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:177
friend std::ostream & operator<<(std::ostream &stream, const Cigar &cigar)
Writes all of the cigar operations contained in the cigar to the passed in stream.
Definition: Cigar.h:540
char getCigarCharOpFromQueryIndex(int32_t queryIndex)
Return the character code of the cigar operator associated with the specified queryIndex.
Definition: Cigar.cpp:314
static bool foundInQuery(const CigarOperator &op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:254
Cigar()
Default constructor initializes as a CIGAR with no operations.
Definition: Cigar.h:326
int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex)
Returns the index into the expanded cigar for the cigar associated with the specified queryIndex.
Definition: Cigar.cpp:258
static bool foundInQuery(char op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:236
static bool isMatchOrMismatch(Operation op)
Return true if the specified operation is a match/mismatch operation, false if not.
Definition: Cigar.h:298
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
static bool foundInReference(const CigarOperator &op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:212
static bool isMatchOrMismatch(const CigarOperator &op)
Return true if the specified operation is a match/mismatch operation, false if not.
Definition: Cigar.h:313
int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, int32_t queryStartPos)
Returns the index into the expanded cigar for the cigar associated with the specified reference posit...
Definition: Cigar.cpp:288
void getExpandedString(std::string &s) const
Sets the specified string to a valid CIGAR string of characters that represent the cigar with no digi...
Definition: Cigar.cpp:63
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
Definition: Cigar.cpp:144
bool hasIndel()
Return whether or not the cigar has indels (insertions or delections)
Definition: Cigar.cpp:398
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
Definition: Cigar.cpp:166
char getCigarCharOpFromRefOffset(int32_t refOffset)
Return the character code of the cigar operator associated with the specified reference offset.
Definition: Cigar.cpp:320
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition: Cigar.cpp:217
int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset)
Returns the index into the expanded cigar for the cigar associated with the specified reference offse...
Definition: Cigar.cpp:273
static bool isClip(char op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:276
Operation
Enum for the cigar operations.
Definition: Cigar.h:87
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition: Cigar.h:95
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
@ pad
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition: Cigar.h:96
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
@ none
no operation has been set.
Definition: Cigar.h:88
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition: Cigar.cpp:187
void Dump() const
Write this object as a string to cout.
Definition: Cigar.h:370
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:354
static bool isClip(const CigarOperator &op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:291
static bool foundInReference(char op)
Return true if the specified operation is found in the reference sequence, false if not.
Definition: Cigar.h:194
bool operator==(Cigar &rhs) const
Return true if the 2 Cigars are the same (the same operations of the same sizes).
Definition: Cigar.cpp:80
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
Definition: Cigar.cpp:334
char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
Return the character code of the cigar operator associated with the specified reference position.
Definition: Cigar.cpp:326
const CigarOperator & operator[](int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:348
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object.
Definition: Cigar.cpp:52
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition: Cigar.cpp:202
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:261
CigarOperator(Operation operation, uint32_t count)
Set the cigar operator with the specified operation and count length.
Definition: Cigar.h:118
bool operator!=(const CigarOperator &rhs) const
Compare only on the operator, false if they are the same, true if not. Match and mismatch are conside...
Definition: Cigar.h:163
bool operator==(const CigarOperator &rhs) const
Compare only on the operator, true if they are the same, false if not. Match and mismatch are conside...
Definition: Cigar.h:153
char getChar() const
Get the character code (M, I, D, N, S, H, or P) associated with this operation.
Definition: Cigar.h:127