diff --git a/src/annotateBed/annotateBed.cpp b/src/annotateBed/annotateBed.cpp index 10477d36e4cec1a2b672673645c5ff6da4ea147b..9dcf45ac6db5d6718b92d27655f40a6fe725a561 100644 --- a/src/annotateBed/annotateBed.cpp +++ b/src/annotateBed/annotateBed.cpp @@ -13,7 +13,7 @@ #include "annotateBed.h" // build -BedAnnotate::BedAnnotate(const string &mainFile, const vector<string> &annoFileNames, +BedAnnotate::BedAnnotate(const string &mainFile, const vector<string> &annoFileNames, const vector<string> &annoTitles, bool forceStrand, bool reportCounts, bool reportBoth) : _mainFile(mainFile), @@ -24,36 +24,36 @@ BedAnnotate::BedAnnotate(const string &mainFile, const vector<string> &annoFileN _reportBoth(reportBoth) { _bed = new BedFile(_mainFile); -} +} // destroy and delete the open file pointers BedAnnotate::~BedAnnotate(void) { - delete _bed; + delete _bed; CloseAnnoFiles(); } void BedAnnotate::OpenAnnoFiles() { - for (size_t i=0; i < _annoFileNames.size(); ++i) { - BedFile *file = new BedFile(_annoFileNames[i]); - file->Open(); - _annoFiles.push_back(file); - } + for (size_t i=0; i < _annoFileNames.size(); ++i) { + BedFile *file = new BedFile(_annoFileNames[i]); + file->Open(); + _annoFiles.push_back(file); + } } void BedAnnotate::CloseAnnoFiles() { - for (size_t i=0; i < _annoFiles.size(); ++i) { - BedFile *file = _annoFiles[i]; - delete file; - _annoFiles[i] = NULL; - } + for (size_t i=0; i < _annoFiles.size(); ++i) { + BedFile *file = _annoFiles[i]; + delete file; + _annoFiles[i] = NULL; + } } void BedAnnotate::PrintHeader() { - // print a hash to indicate header and then write a tab + // print a hash to indicate header and then write a tab // for each field in the main file. printf("#"); for (size_t i = 0; i < _bed->bedType; ++i) @@ -74,135 +74,135 @@ void BedAnnotate::PrintHeader() { void BedAnnotate::InitializeMainFile() { - // process each chromosome - masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin(); - masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - // for each chrom, process each bin - binsToBedCovLists::iterator binItr = chromItr->second.begin(); - binsToBedCovLists::iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { + // process each chromosome + masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin(); + masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end(); + for (; chromItr != chromEnd; ++chromItr) { + // for each chrom, process each bin + binsToBedCovLists::iterator binItr = chromItr->second.begin(); + binsToBedCovLists::iterator binEnd = chromItr->second.end(); + for (; binItr != binEnd; ++binItr) { // initialize BEDCOVLIST in this chrom/bin - vector<BEDCOVLIST>::iterator bedItr = binItr->second.begin(); - vector<BEDCOVLIST>::iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // initialize the depthMaps, counts, etc. for each anno file. + vector<BEDCOVLIST>::iterator bedItr = binItr->second.begin(); + vector<BEDCOVLIST>::iterator bedEnd = binItr->second.end(); + for (; bedItr != bedEnd; ++bedItr) { + // initialize the depthMaps, counts, etc. for each anno file. for (size_t i = 0; i < _annoFiles.size(); ++i) { map<unsigned int, DEPTH> dummy; bedItr->depthMapList.push_back(dummy); bedItr->counts.push_back(0); bedItr->minOverlapStarts.push_back(INT_MAX); } - } - } + } + } } } void BedAnnotate::AnnotateBed() { - - // load the "main" bed file into a map so - // that we can easily compare each annoFile to it for overlaps - _bed->loadBedCovListFileIntoMap(); + + // load the "main" bed file into a map so + // that we can easily compare each annoFile to it for overlaps + _bed->loadBedCovListFileIntoMap(); // open the annotations files for processing; - OpenAnnoFiles(); + OpenAnnoFiles(); // initialize counters, depths, etc. for the main file InitializeMainFile(); - - // annotate the main file with the coverage from the annotation files. + + // annotate the main file with the coverage from the annotation files. for (size_t annoIndex = 0; annoIndex < _annoFiles.size(); ++annoIndex) { // grab the current annotation file. - BedFile *anno = _annoFiles[annoIndex]; + BedFile *anno = _annoFiles[annoIndex]; int lineNum = 0; - BED a, nullBed; - BedLineStatus bedStatus; - - // process each entry in the current anno file - while ((bedStatus = anno->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - _bed->countListHits(a, annoIndex, _forceStrand); - a = nullBed; - } - } - } - - // report the annotations of the main file from the anno file. - ReportAnnotations(); - // close the annotations files; - CloseAnnoFiles(); + BED a, nullBed; + BedLineStatus bedStatus; + + // process each entry in the current anno file + while ((bedStatus = anno->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + _bed->countListHits(a, annoIndex, _forceStrand); + a = nullBed; + } + } + } + + // report the annotations of the main file from the anno file. + ReportAnnotations(); + // close the annotations files; + CloseAnnoFiles(); } void BedAnnotate::ReportAnnotations() { - + if (_annoTitles.size() > 0) { PrintHeader(); } - // process each chromosome - masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin(); - masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end(); - for (; chromItr != chromEnd; ++chromItr) { - // for each chrom, process each bin - binsToBedCovLists::const_iterator binItr = chromItr->second.begin(); - binsToBedCovLists::const_iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) { - // for each chrom & bin, compute and report + // process each chromosome + masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin(); + masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end(); + for (; chromItr != chromEnd; ++chromItr) { + // for each chrom, process each bin + binsToBedCovLists::const_iterator binItr = chromItr->second.begin(); + binsToBedCovLists::const_iterator binEnd = chromItr->second.end(); + for (; binItr != binEnd; ++binItr) { + // for each chrom & bin, compute and report // the observed coverage for each feature - vector<BEDCOVLIST>::const_iterator bedItr = binItr->second.begin(); - vector<BEDCOVLIST>::const_iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // print the main BED entry. - _bed->reportBedTab(*bedItr); - + vector<BEDCOVLIST>::const_iterator bedItr = binItr->second.begin(); + vector<BEDCOVLIST>::const_iterator bedEnd = binItr->second.end(); + for (; bedItr != bedEnd; ++bedItr) { + // print the main BED entry. + _bed->reportBedTab(*bedItr); + // now report the coverage from each annotation file. for (size_t i = 0; i < _annoFiles.size(); ++i) { - unsigned int totalLength = 0; - int zeroDepthCount = 0; // number of bases with zero depth - int depth = 0; // tracks the depth at the current base - - // the start is either the first base in the feature OR - // the leftmost position of an overlapping feature. e.g. (s = start): - // A ---------- - // B s ------------ - int start = min(bedItr->minOverlapStarts[i], bedItr->start); - - map<unsigned int, DEPTH>::const_iterator depthItr; - map<unsigned int, DEPTH>::const_iterator depthEnd; - - // compute the coverage observed at each base in the feature marching from start to end. - for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { - // map pointer grabbing the starts and ends observed at this position - depthItr = bedItr->depthMapList[i].find(pos); + unsigned int totalLength = 0; + int zeroDepthCount = 0; // number of bases with zero depth + int depth = 0; // tracks the depth at the current base + + // the start is either the first base in the feature OR + // the leftmost position of an overlapping feature. e.g. (s = start): + // A ---------- + // B s ------------ + int start = min(bedItr->minOverlapStarts[i], bedItr->start); + + map<unsigned int, DEPTH>::const_iterator depthItr; + map<unsigned int, DEPTH>::const_iterator depthEnd; + + // compute the coverage observed at each base in the feature marching from start to end. + for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { + // map pointer grabbing the starts and ends observed at this position + depthItr = bedItr->depthMapList[i].find(pos); depthEnd = bedItr->depthMapList[i].end(); - - // increment coverage if starts observed at this position. - if (depthItr != depthEnd) - depth += depthItr->second.starts; - // update zero depth - if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) - zeroDepthCount++; - // decrement coverage if ends observed at this position. - if (depthItr != depthEnd) - depth = depth - depthItr->second.ends; - } - // Summarize the coverage for the current interval, - CHRPOS length = bedItr->end - bedItr->start; - totalLength += length; - int nonZeroBases = (length - zeroDepthCount); - float fractCovered = (float) nonZeroBases / length; - if (_reportCounts == false && _reportBoth == false) - printf("%f\t", fractCovered); - else if (_reportCounts == true && _reportBoth == false) - printf("%d\t", bedItr->counts[i]); + + // increment coverage if starts observed at this position. + if (depthItr != depthEnd) + depth += depthItr->second.starts; + // update zero depth + if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0)) + zeroDepthCount++; + // decrement coverage if ends observed at this position. + if (depthItr != depthEnd) + depth = depth - depthItr->second.ends; + } + // Summarize the coverage for the current interval, + CHRPOS length = bedItr->end - bedItr->start; + totalLength += length; + int nonZeroBases = (length - zeroDepthCount); + float fractCovered = (float) nonZeroBases / length; + if (_reportCounts == false && _reportBoth == false) + printf("%f\t", fractCovered); + else if (_reportCounts == true && _reportBoth == false) + printf("%d\t", bedItr->counts[i]); else if (_reportCounts == false && _reportBoth == true) - printf("%d\t%f\t", bedItr->counts[i], fractCovered); - } - // print newline for next feature. + printf("%d\t%f\t", bedItr->counts[i], fractCovered); + } + // print newline for next feature. printf("\n"); - } - } - } + } + } + } } diff --git a/src/annotateBed/annotateBed.h b/src/annotateBed/annotateBed.h index fe02ecb3dfe782f63f1db65c9137aef1861b410e..1928e61e37f6dd66cc079a07bc78b6f052beafb1 100644 --- a/src/annotateBed/annotateBed.h +++ b/src/annotateBed/annotateBed.h @@ -9,7 +9,7 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ -#ifndef ANNOTATEBED_H +#ifndef ANNOTATEBED_H #define ANNOTATEBED_H #include "bedFile.h" @@ -35,41 +35,41 @@ class BedAnnotate { public: - // constructor - BedAnnotate(const string &mainFile, const vector<string> &annoFileNames, - const vector<string> &annoTitles, bool forceStrand, bool reportCounts, bool reportBoth); + // constructor + BedAnnotate(const string &mainFile, const vector<string> &annoFileNames, + const vector<string> &annoTitles, bool forceStrand, bool reportCounts, bool reportBoth); + + // destructor + ~BedAnnotate(void); + + // annotate the master file with all of the annotation files. + void AnnotateBed(); - // destructor - ~BedAnnotate(void); - - // annotate the master file with all of the annotation files. - void AnnotateBed(); - private: - // input files. - string _mainFile; + // input files. + string _mainFile; vector<string> _annoFileNames; vector<string> _annoTitles; - - // instance of a bed file class. + + // instance of a bed file class. BedFile *_bed; vector<BedFile*> _annoFiles; - - // do we care about strandedness when counting coverage? - bool _forceStrand; + + // do we care about strandedness when counting coverage? + bool _forceStrand; bool _reportCounts; bool _reportBoth; - - // private function for reporting coverage information - void ReportAnnotations(); - + + // private function for reporting coverage information + void ReportAnnotations(); + void OpenAnnoFiles(); - + void CloseAnnoFiles(); - + void PrintHeader(); - + void InitializeMainFile(); }; #endif /* ANNOTATEBED_H */ diff --git a/src/annotateBed/annotateMain.cpp b/src/annotateBed/annotateMain.cpp index 65a440709137f52a7c6ce126ceda16f2883f6e33..eb094409d76d8d665de4943eafc97ba2368fde92 100644 --- a/src/annotateBed/annotateMain.cpp +++ b/src/annotateBed/annotateMain.cpp @@ -25,135 +25,135 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input file - string mainFile; - - // parm flags - bool forceStrand = false; - bool haveBed = false; - bool haveFiles = false; - bool haveTitles = false; + // our configuration variables + bool showHelp = false; + + // input file + string mainFile; + + // parm flags + bool forceStrand = false; + bool haveBed = false; + bool haveFiles = false; + bool haveTitles = false; bool reportCounts = false; - bool reportBoth = false; + bool reportBoth = false; // list of annotation files / names - vector<string> inputFiles; - vector<string> inputTitles; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - haveBed = true; - mainFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-files", 6, parameterLength)) { - if ((i+1) < argc) { - haveFiles = true; + vector<string> inputFiles; + vector<string> inputTitles; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + mainFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-files", 6, parameterLength)) { + if ((i+1) < argc) { + haveFiles = true; i = i+1; string file = argv[i]; - while (file[0] != '-' && i < argc) { + while (file[0] != '-' && i < argc) { inputFiles.push_back(file); i++; if (i < argc) file = argv[i]; - } + } i--; - } - } - else if(PARAMETER_CHECK("-names", 6, parameterLength)) { - if ((i+1) < argc) { - haveTitles = true; + } + } + else if(PARAMETER_CHECK("-names", 6, parameterLength)) { + if ((i+1) < argc) { + haveTitles = true; i = i+1; string title = argv[i]; - while (title[0] != '-' && i < argc) { + while (title[0] != '-' && i < argc) { inputTitles.push_back(title); i++; if (i < argc) title = argv[i]; - } + } i--; - } - } - else if(PARAMETER_CHECK("-counts", 7, parameterLength)) { - reportCounts = true; - } - else if(PARAMETER_CHECK("-both", 5, parameterLength)) { - reportBoth = true; - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed || !haveFiles) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i and -files files. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedAnnotate *ba = new BedAnnotate(mainFile, inputFiles, inputTitles, forceStrand, reportCounts, reportBoth); + } + } + else if(PARAMETER_CHECK("-counts", 7, parameterLength)) { + reportCounts = true; + } + else if(PARAMETER_CHECK("-both", 5, parameterLength)) { + reportBoth = true; + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveFiles) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i and -files files. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedAnnotate *ba = new BedAnnotate(mainFile, inputFiles, inputTitles, forceStrand, reportCounts, reportBoth); ba->AnnotateBed(); - delete ba; - return 0; - } - else { - ShowHelp(); - } + delete ba; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Annotates the depth & breadth of coverage of features from multiple files" << endl; - cerr << "\t on the intervals in -i." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -files FILE1 FILE2 .. FILEn" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-names\t" << "A list of names (one / file) to describe each file in -i." << endl; - cerr << "\t\tThese names will be printed as a header line." << endl << endl; - - cerr << "\t-counts\t" << "Report the count of features in each file that overlap -i." << endl; - cerr << "\t\t- Default is to report the fraction of -i covered by each file." << endl << endl; - - cerr << "\t-both\t" << "Report the counts followed by the % coverage." << endl; - cerr << "\t\t- Default is to report the fraction of -i covered by each file." << endl << endl; - - cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; - cerr << "\t\toverlap B on the same strand." << endl; - cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; - - exit(1); + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Annotates the depth & breadth of coverage of features from multiple files" << endl; + cerr << "\t on the intervals in -i." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -files FILE1 FILE2 .. FILEn" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-names\t" << "A list of names (one / file) to describe each file in -i." << endl; + cerr << "\t\tThese names will be printed as a header line." << endl << endl; + + cerr << "\t-counts\t" << "Report the count of features in each file that overlap -i." << endl; + cerr << "\t\t- Default is to report the fraction of -i covered by each file." << endl << endl; + + cerr << "\t-both\t" << "Report the counts followed by the % coverage." << endl; + cerr << "\t\t- Default is to report the fraction of -i covered by each file." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; + cerr << "\t\toverlap B on the same strand." << endl; + cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; + + exit(1); } diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index 7f46b466d12318d358e3b15c79e80ffea7885282..fa042fe9382b04b4466862712054f7e974a4de21 100644 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -17,7 +17,7 @@ using namespace BamTools; #include <vector> -#include <algorithm> // for swap() +#include <algorithm> // for swap() #include <iostream> #include <fstream> #include <stdlib.h> @@ -38,392 +38,392 @@ void ShowHelp(void); void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag, const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar); void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); - + void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar); void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color = "255,0,0"); void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance); -void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, +void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockEnds, int &alignmentEnd); string BuildCigarString(const vector<CigarOp> &cigar); - + bool IsCorrectMappingForBEDPE (const BamAlignment &bam); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; + // our configuration variables + bool showHelp = false; - // input files - string bamFile = "stdin"; - string color = "255,0,0"; + // input files + string bamFile = "stdin"; + string color = "255,0,0"; string tag = ""; - - bool haveBam = true; - bool haveColor = false; - bool haveOtherTag = false; - bool writeBedPE = false; - bool writeBed12 = false; - bool useEditDistance = false; - bool useAlignmentScore = false; + + bool haveBam = true; + bool haveColor = false; + bool haveOtherTag = false; + bool writeBedPE = false; + bool writeBed12 = false; + bool useEditDistance = false; + bool useAlignmentScore = false; bool useCigar = false; - bool obeySplits = false; - - // check to see if we should print out some help - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bamFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { - writeBedPE = true; - } - else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { - writeBed12 = true; - } - else if(PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { - useEditDistance = true; - } - else if(PARAMETER_CHECK("-cigar", 6, parameterLength)) { - useCigar = true; - } - else if(PARAMETER_CHECK("-as", 3, parameterLength)) { - useAlignmentScore = true; - } - else if(PARAMETER_CHECK("-color", 6, parameterLength)) { - if ((i+1) < argc) { - haveColor = true; - color = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-tag", 4, parameterLength)) { - if ((i+1) < argc) { - haveOtherTag = true; - tag = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (haveBam == false) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; - showHelp = true; - } - if (haveColor == true && writeBed12 == false) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; - showHelp = true; - } - if (useEditDistance == true && obeySplits == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; - showHelp = true; - } - if (useEditDistance == true && useCigar == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl << "*****" << endl; - showHelp = true; - } - if (useEditDistance == true && haveOtherTag == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl << "*****" << endl; - showHelp = true; - } - if (writeBedPE == true && haveOtherTag == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -tag with -bedpe." << endl << "*****" << endl; - showHelp = true; - } - // if there are no problems, let's convert BAM to BED or BEDPE - if (!showHelp) { - if (writeBedPE == false) - ConvertBamToBed(bamFile, useEditDistance, tag, writeBed12, obeySplits, color, useCigar); // BED or "blocked BED" - else - ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE - } - else { - ShowHelp(); - } + bool obeySplits = false; + + // check to see if we should print out some help + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bamFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { + writeBedPE = true; + } + else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { + writeBed12 = true; + } + else if(PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { + useEditDistance = true; + } + else if(PARAMETER_CHECK("-cigar", 6, parameterLength)) { + useCigar = true; + } + else if(PARAMETER_CHECK("-as", 3, parameterLength)) { + useAlignmentScore = true; + } + else if(PARAMETER_CHECK("-color", 6, parameterLength)) { + if ((i+1) < argc) { + haveColor = true; + color = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-tag", 4, parameterLength)) { + if ((i+1) < argc) { + haveOtherTag = true; + tag = argv[i + 1]; + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (haveBam == false) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; + showHelp = true; + } + if (haveColor == true && writeBed12 == false) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; + showHelp = true; + } + if (useEditDistance == true && obeySplits == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; + showHelp = true; + } + if (useEditDistance == true && useCigar == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl << "*****" << endl; + showHelp = true; + } + if (useEditDistance == true && haveOtherTag == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl << "*****" << endl; + showHelp = true; + } + if (writeBedPE == true && haveOtherTag == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -tag with -bedpe." << endl << "*****" << endl; + showHelp = true; + } + // if there are no problems, let's convert BAM to BED or BEDPE + if (!showHelp) { + if (writeBedPE == false) + ConvertBamToBed(bamFile, useEditDistance, tag, writeBed12, obeySplits, color, useCigar); // BED or "blocked BED" + else + ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Summary: Converts BAM alignments to BED6 or BEDPE format." << endl << endl; + cerr << "Summary: Converts BAM alignments to BED6 or BEDPE format." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl; - cerr << "Options: " << endl; - - cerr << "\t-bedpe\t" << "Write BEDPE format." << endl; - cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; - - cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; - cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; + cerr << "Options: " << endl; - cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; - - cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; - cerr << "\t\t- Default for BED is to use mapping quality." << endl; - cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; - cerr << "\t\t the two mapping qualities for the pair." << endl; - cerr << "\t\t- When -ed is used with -bedpe, the total edit" << endl; - cerr << "\t\t distance from the two mates is reported." << endl << endl; + cerr << "\t-bedpe\t" << "Write BEDPE format." << endl; + cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; - cerr << "\t-tag\t" << "Use other NUMERIC BAM alignment tag for BED score." << endl; - cerr << "\t\t- Default for BED is to use mapping quality." << endl; - cerr << "\t\t Disallowed with BEDPE output." << endl << endl; + cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; + cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; - cerr << "\t-color\t" << "An R,G,B string for the color used with BED12 format." << endl; - cerr << "\t\tDefault is (255,0,0)." << endl << endl; - - cerr << "\t-cigar\t" << "Add the CIGAR string to the BED entry as a 7th column." << endl << endl; + cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; + cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; + cerr << "\t\t- Default for BED is to use mapping quality." << endl; + cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; + cerr << "\t\t the two mapping qualities for the pair." << endl; + cerr << "\t\t- When -ed is used with -bedpe, the total edit" << endl; + cerr << "\t\t distance from the two mates is reported." << endl << endl; - // end the program here - exit(1); + cerr << "\t-tag\t" << "Use other NUMERIC BAM alignment tag for BED score." << endl; + cerr << "\t\t- Default for BED is to use mapping quality." << endl; + cerr << "\t\t Disallowed with BEDPE output." << endl << endl; + + cerr << "\t-color\t" << "An R,G,B string for the color used with BED12 format." << endl; + cerr << "\t\tDefault is (255,0,0)." << endl << endl; + + cerr << "\t-cigar\t" << "Add the CIGAR string to the BED entry as a 7th column." << endl << endl; + + + // end the program here + exit(1); } void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag, const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar) { - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // rip through the BAM file and convert each mapped entry to BED - BamAlignment bam; - while (reader.GetNextAlignment(bam)) { - if (bam.IsMapped() == true) { - if (writeBed12 == false) // BED - PrintBed(bam, refs, useEditDistance, bamTag, obeySplits, useCigar); - else //"blocked" BED - PrintBed12(bam, refs, useEditDistance, bamTag, color); - } - } - reader.Close(); + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // rip through the BAM file and convert each mapped entry to BED + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + if (bam.IsMapped() == true) { + if (writeBed12 == false) // BED + PrintBed(bam, refs, useEditDistance, bamTag, obeySplits, useCigar); + else //"blocked" BED + PrintBed12(bam, refs, useEditDistance, bamTag, color); + } + } + reader.Close(); } /* Assumptions: - 1. The BAM file is grouped/sorted by query name, + 1. The BAM file is grouped/sorted by query name, not alignment position */ void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance) { - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // rip through the BAM file and convert each mapped entry to BEDPE - BamAlignment bam1, bam2; - while (reader.GetNextAlignment(bam1)) { - // the alignment must be paired - if (bam1.IsPaired() == true) { - // grab the second alignment for the pair. - reader.GetNextAlignment(bam2); - - // require that the alignments are from the same query - if (bam1.Name == bam2.Name) { - PrintBedPE(bam1, bam2, refs, useEditDistance); - } - else { - cerr << "*****ERROR: -bedpe requires BAM to be sorted/grouped by query name. " << endl; - exit(1); - } - } - } - reader.Close(); + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // rip through the BAM file and convert each mapped entry to BEDPE + BamAlignment bam1, bam2; + while (reader.GetNextAlignment(bam1)) { + // the alignment must be paired + if (bam1.IsPaired() == true) { + // grab the second alignment for the pair. + reader.GetNextAlignment(bam2); + + // require that the alignments are from the same query + if (bam1.Name == bam2.Name) { + PrintBedPE(bam1, bam2, refs, useEditDistance); + } + else { + cerr << "*****ERROR: -bedpe requires BAM to be sorted/grouped by query name. " << endl; + exit(1); + } + } + } + reader.Close(); } void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) { - int currPosition = 0; - int blockLength = 0; - - // Rip through the CIGAR ops and figure out if there is more - // than one block for this alignment - vector<CigarOp>::const_iterator cigItr = cigar.begin(); - vector<CigarOp>::const_iterator cigEnd = cigar.end(); - for (; cigItr != cigEnd; ++cigItr) { - switch (cigItr->Type) { - case ('M') : - blockLength += cigItr->Length; - currPosition += cigItr->Length; - case ('I') : break; - case ('S') : break; - case ('D') : break; - blockLength += cigItr->Length; - currPosition += cigItr->Length; - case ('P') : break; - case ('N') : - blockStarts.push_back(currPosition + cigItr->Length); - blockLengths.push_back(blockLength); - currPosition += cigItr->Length; - blockLength = 0; - case ('H') : break; // for 'H' - do nothing, move to next op - default : - printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - // add the kast block and set the - // alignment end (i.e., relative to the start) - blockLengths.push_back(blockLength); - alignmentEnd = currPosition; + int currPosition = 0; + int blockLength = 0; + + // Rip through the CIGAR ops and figure out if there is more + // than one block for this alignment + vector<CigarOp>::const_iterator cigItr = cigar.begin(); + vector<CigarOp>::const_iterator cigEnd = cigar.end(); + for (; cigItr != cigEnd; ++cigItr) { + switch (cigItr->Type) { + case ('M') : + blockLength += cigItr->Length; + currPosition += cigItr->Length; + case ('I') : break; + case ('S') : break; + case ('D') : break; + blockLength += cigItr->Length; + currPosition += cigItr->Length; + case ('P') : break; + case ('N') : + blockStarts.push_back(currPosition + cigItr->Length); + blockLengths.push_back(blockLength); + currPosition += cigItr->Length; + blockLength = 0; + case ('H') : break; // for 'H' - do nothing, move to next op + default : + printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + // add the kast block and set the + // alignment end (i.e., relative to the start) + blockLengths.push_back(blockLength); + alignmentEnd = currPosition; } string BuildCigarString(const vector<CigarOp> &cigar) { stringstream cigarString; - - for (size_t i = 0; i < cigar.size(); ++i) { - //cerr << cigar[i].Type << " " << cigar[i].Length << endl; - switch (cigar[i].Type) { - case ('M') : - case ('I') : - case ('D') : - case ('N') : - case ('S') : - case ('H') : - case ('P') : + + for (size_t i = 0; i < cigar.size(); ++i) { + //cerr << cigar[i].Type << " " << cigar[i].Length << endl; + switch (cigar[i].Type) { + case ('M') : + case ('I') : + case ('D') : + case ('N') : + case ('S') : + case ('H') : + case ('P') : cigarString << cigar[i].Length << cigar[i].Type; } - } + } return cigarString.str(); } void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar) { - // set the strand - string strand = "+"; - if (bam.IsReverseStrand() == true) strand = "-"; + // set the strand + string strand = "+"; + if (bam.IsReverseStrand() == true) strand = "-"; - // set the name of the feature based on the sequence - string name = bam.Name; - if (bam.IsFirstMate() == true) name += "/1"; - if (bam.IsSecondMate() == true) name += "/2"; + // set the name of the feature based on the sequence + string name = bam.Name; + if (bam.IsFirstMate() == true) name += "/1"; + if (bam.IsSecondMate() == true) name += "/2"; - // get the unpadded (parm = false) end position based on the CIGAR - unsigned int alignmentEnd = bam.GetEndPosition(false); + // get the unpadded (parm = false) end position based on the CIGAR + unsigned int alignmentEnd = bam.GetEndPosition(false); // report the entire BAM footprint as a single BED entry if (obeySplits == false) { - // report the alignment in BED6 format. - if (useEditDistance == false && bamTag == "") { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); - } - else if (useEditDistance == true && bamTag == "") { - uint32_t editDistance; - if (bam.GetTag("NM", editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - else if (useEditDistance == false && bamTag != "") { - int32_t tagValue; - if (bam.GetTag(bamTag, tagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), tagValue, strand.c_str()); - } - else { - cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; - exit(1); - } - } - - // does the user want CIGAR as well? - if (useCigar == false) { + // report the alignment in BED6 format. + if (useEditDistance == false && bamTag == "") { + printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + } + else if (useEditDistance == true && bamTag == "") { + uint32_t editDistance; + if (bam.GetTag("NM", editDistance)) { + printf("%s\t%d\t%d\t\%s\t%u\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), editDistance, strand.c_str()); + } + else { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + else if (useEditDistance == false && bamTag != "") { + int32_t tagValue; + if (bam.GetTag(bamTag, tagValue)) { + printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), tagValue, strand.c_str()); + } + else { + cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; + exit(1); + } + } + + // does the user want CIGAR as well? + if (useCigar == false) { printf("\n"); - } - else { + } + else { string cigar = BuildCigarString(bam.CigarData); printf("\t%s\n", cigar.c_str()); - } - } + } + } // Report each chunk of the BAM alignment as a discrete BED entry // For example 10M100N10M would be reported as two seprate BED entries of length 10 - else { + else { vector<BED> bedBlocks; - // Load the alignment blocks in bam into the bedBlocks vector. + // Load the alignment blocks in bam into the bedBlocks vector. // Don't trigger a new block when a "D" (deletion) CIGAR op is found. getBamBlocks(bam, refs, bedBlocks, false); - + vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, - bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); - } - } + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, + bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); + } + } } void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color) { - // set the strand - string strand = "+"; - if (bam.IsReverseStrand()) strand = "-"; - - // set the name of the feature based on the sequence - string name = bam.Name; - if (bam.IsFirstMate()) name += "/1"; - if (bam.IsSecondMate()) name += "/2"; - - // parse the CIGAR string and figure out the alignment blocks - unsigned int alignmentEnd; - vector<int> blockLengths; - vector<int> blockStarts; - blockStarts.push_back(0); - - // extract the block starts and lengths from the CIGAR string - ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd); - alignmentEnd += bam.Position; - + // set the strand + string strand = "+"; + if (bam.IsReverseStrand()) strand = "-"; + + // set the name of the feature based on the sequence + string name = bam.Name; + if (bam.IsFirstMate()) name += "/1"; + if (bam.IsSecondMate()) name += "/2"; + + // parse the CIGAR string and figure out the alignment blocks + unsigned int alignmentEnd; + vector<int> blockLengths; + vector<int> blockStarts; + blockStarts.push_back(0); + + // extract the block starts and lengths from the CIGAR string + ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd); + alignmentEnd += bam.Position; + // write BED6 portion if (useEditDistance == false && bamTag == "") { printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, @@ -440,125 +440,125 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist exit(1); } } - else if (useEditDistance == false && bamTag != "") { - int32_t tagValue; - if (bam.GetTag(bamTag, tagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), tagValue, strand.c_str()); - } - else { - cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; - exit(1); - } - } - - // write the colors, etc. - printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) blockStarts.size()); - - // now write the lengths portion - unsigned int b; - for (b = 0; b < blockLengths.size() - 1; ++b) { - printf("%d,", blockLengths[b]); - } - printf("%d\t", blockLengths[b]); - - // now write the starts portion - for (b = 0; b < blockStarts.size() - 1; ++b) { - printf("%d,", blockStarts[b]); - } - printf("%d\n", blockStarts[b]); + else if (useEditDistance == false && bamTag != "") { + int32_t tagValue; + if (bam.GetTag(bamTag, tagValue)) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), tagValue, strand.c_str()); + } + else { + cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; + exit(1); + } + } + + // write the colors, etc. + printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) blockStarts.size()); + + // now write the lengths portion + unsigned int b; + for (b = 0; b < blockLengths.size() - 1; ++b) { + printf("%d,", blockLengths[b]); + } + printf("%d\t", blockLengths[b]); + + // now write the starts portion + for (b = 0; b < blockStarts.size() - 1; ++b) { + printf("%d,", blockStarts[b]); + } + printf("%d\n", blockStarts[b]); } void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance) { - // initialize BEDPE variables - string chrom1, chrom2, strand1, strand2; - int start1, start2, end1, end2; - uint32_t editDistance1, editDistance2; - start1 = start2 = end1 = end2 = -1; - chrom1 = chrom2 = strand1 = strand2 = "."; - editDistance1 = editDistance2 = 0; - uint16_t minMapQuality = 0; - - // extract relevant info for end 1 - if (bam1.IsMapped()) { - chrom1 = refs.at(bam1.RefID).RefName; - start1 = bam1.Position; - end1 = bam1.GetEndPosition(false); - strand1 = "+"; - if (bam1.IsReverseStrand()) strand1 = "-"; - - // extract the edit distance from the NM tag - // if possible. otherwise, complain. - if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - - // extract relevant info for end 2 - if (bam2.IsMapped()) { - chrom2 = refs.at(bam2.RefID).RefName; - start2 = bam2.Position; - end2 = bam2.GetEndPosition(false); - strand2 = "+"; - if (bam2.IsReverseStrand()) strand2 = "-"; - - // extract the edit distance from the NM tag - // if possible. otherwise, complain. - if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - - // swap the ends if necessary - if ( chrom1 > chrom2 || ((chrom1 == chrom2) && (start1 > start2)) ) { - swap(chrom1, chrom2); - swap(start1, start2); - swap(end1, end2); - swap(strand1, strand2); - } - - // report BEDPE using min mapQuality - if (useEditDistance == false) { - // compute the minimum mapping quality b/w the two ends of the pair. - if (bam1.IsMapped() == true && bam2.IsMapped() == true) - minMapQuality = min(bam1.MapQuality, bam2.MapQuality); - - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str()); - } - // report BEDPE using total edit distance - else { - uint16_t totalEditDistance = 0; - if (bam1.IsMapped() == true && bam2.IsMapped() == true) - totalEditDistance = editDistance1 + editDistance2; - else if (bam1.IsMapped() == true) - totalEditDistance = editDistance1; - else if (bam2.IsMapped() == true) - totalEditDistance = editDistance2; - - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str()); - } + // initialize BEDPE variables + string chrom1, chrom2, strand1, strand2; + int start1, start2, end1, end2; + uint32_t editDistance1, editDistance2; + start1 = start2 = end1 = end2 = -1; + chrom1 = chrom2 = strand1 = strand2 = "."; + editDistance1 = editDistance2 = 0; + uint16_t minMapQuality = 0; + + // extract relevant info for end 1 + if (bam1.IsMapped()) { + chrom1 = refs.at(bam1.RefID).RefName; + start1 = bam1.Position; + end1 = bam1.GetEndPosition(false); + strand1 = "+"; + if (bam1.IsReverseStrand()) strand1 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + + // extract relevant info for end 2 + if (bam2.IsMapped()) { + chrom2 = refs.at(bam2.RefID).RefName; + start2 = bam2.Position; + end2 = bam2.GetEndPosition(false); + strand2 = "+"; + if (bam2.IsReverseStrand()) strand2 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + + // swap the ends if necessary + if ( chrom1 > chrom2 || ((chrom1 == chrom2) && (start1 > start2)) ) { + swap(chrom1, chrom2); + swap(start1, start2); + swap(end1, end2); + swap(strand1, strand2); + } + + // report BEDPE using min mapQuality + if (useEditDistance == false) { + // compute the minimum mapping quality b/w the two ends of the pair. + if (bam1.IsMapped() == true && bam2.IsMapped() == true) + minMapQuality = min(bam1.MapQuality, bam2.MapQuality); + + printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, + bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str()); + } + // report BEDPE using total edit distance + else { + uint16_t totalEditDistance = 0; + if (bam1.IsMapped() == true && bam2.IsMapped() == true) + totalEditDistance = editDistance1 + editDistance2; + else if (bam1.IsMapped() == true) + totalEditDistance = editDistance1; + else if (bam2.IsMapped() == true) + totalEditDistance = editDistance2; + + printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, + bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str()); + } } // deprecated. bool IsCorrectMappingForBEDPE (const BamAlignment &bam) { - if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { - return true; - } - else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { - return true; - } - else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { - return true; - } - else return false; + if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { + return true; + } + else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { + return true; + } + else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { + return true; + } + else return false; } diff --git a/src/bed12ToBed6/bed12ToBed6.cpp b/src/bed12ToBed6/bed12ToBed6.cpp index e73471b6176f1f99e3eb4459becaf30e5a5092ee..a0273664e04d20e4c10d91f8a3631f3cc9fccb69 100644 --- a/src/bed12ToBed6/bed12ToBed6.cpp +++ b/src/bed12ToBed6/bed12ToBed6.cpp @@ -37,115 +37,115 @@ void ProcessBed(istream &bedInput, BedFile *bed); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - bool haveBed = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (!haveBed ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedFile *bed = new BedFile(bedFile); - DetermineBedInput(bed); - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + bool haveBed = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (!haveBed ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedFile *bed = new BedFile(bedFile); + DetermineBedInput(bed); + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "Summary: Splits BED12 features into discrete BED6 features." << endl << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed12>" << endl << endl; + cerr << "Summary: Splits BED12 features into discrete BED6 features." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed12>" << endl << endl; - // end the program here - exit(1); + + // end the program here + exit(1); } void DetermineBedInput(BedFile *bed) { - - // dealing with a proper file - if (bed->bedFile != "stdin") { - - ifstream bedStream(bed->bedFile.c_str(), ios::in); - if ( !bedStream ) { - cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - ProcessBed(bedStream, bed); - } - // reading from stdin - else { - ProcessBed(cin, bed); - } + + // dealing with a proper file + if (bed->bedFile != "stdin") { + + ifstream bedStream(bed->bedFile.c_str(), ios::in); + if ( !bedStream ) { + cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + ProcessBed(bedStream, bed); + } + // reading from stdin + else { + ProcessBed(cin, bed); + } } void ProcessBed(istream &bedInput, BedFile *bed) { - // process each BED entry and convert to BAM - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; - // open the BED file for reading. - bed->Open(); - while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a + // process each BED entry and convert to BAM + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + // open the BED file for reading. + bed->Open(); + while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + + bedVector bedBlocks; // vec to store the discrete BED "blocks" from a splitBedIntoBlocks(bedEntry, lineNum, bedBlocks); - + vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bedItr->chrom.c_str(), bedItr->start, bedItr->end, bedItr->name.c_str(), - bedItr->score.c_str(), bedItr->strand.c_str()); - } - bedEntry = nullBed; - } - } - // close up - bed->Close(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bedItr->chrom.c_str(), bedItr->start, bedItr->end, bedItr->name.c_str(), + bedItr->score.c_str(), bedItr->strand.c_str()); + } + bedEntry = nullBed; + } + } + // close up + bed->Close(); } diff --git a/src/bedToBam/bedToBam.cpp b/src/bedToBam/bedToBam.cpp index 010fbb20fabdc202fd15c322a50981d5efc49cc0..78d9f6d65760af41c509316e56ff53e098267f75 100644 --- a/src/bedToBam/bedToBam.cpp +++ b/src/bedToBam/bedToBam.cpp @@ -45,309 +45,309 @@ int reg2bin(int beg, int end); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - string genomeFile; - - unsigned int mapQual = 255; - - bool haveBed = true; - bool haveGenome = false; - bool haveMapQual = false; - bool isBED12 = false; - bool uncompressedBam = false; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) < argc) { - haveGenome = true; - genomeFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) { - haveMapQual = true; - if ((i+1) < argc) { - mapQual = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { - isBED12 = true; - } - else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + unsigned int mapQual = 255; + + bool haveBed = true; + bool haveGenome = false; + bool haveMapQual = false; + bool isBED12 = false; + bool uncompressedBam = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) { + haveMapQual = true; + if ((i+1) < argc) { + mapQual = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { + isBED12 = true; + } + else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { uncompressedBam = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (!haveBed ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; - showHelp = true; - } - if (!haveGenome ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl; - showHelp = true; - } - if (mapQual < 0 || mapQual > 255) { - cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl; - showHelp = true; - } - - - if (!showHelp) { - BedFile *bed = new BedFile(bedFile); - GenomeFile *genome = new GenomeFile(genomeFile); - + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (!haveBed ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; + showHelp = true; + } + if (!haveGenome ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl; + showHelp = true; + } + if (mapQual < 0 || mapQual > 255) { + cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl; + showHelp = true; + } + + + if (!showHelp) { + BedFile *bed = new BedFile(bedFile); + GenomeFile *genome = new GenomeFile(genomeFile); + ProcessBed(cin, bed, genome, isBED12, mapQual, uncompressedBam); - } - else { - ShowHelp(); - } + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Summary: Converts feature records to BAM format." << endl << endl; + cerr << "Summary: Converts feature records to BAM format." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; - cerr << "Options: " << endl; + cerr << "Options: " << endl; - cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl; - cerr << "\t\t(INT) Default: 255" << endl << endl; + cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl; + cerr << "\t\t(INT) Default: 255" << endl << endl; - cerr << "\t-bed12\t" << "The BED file is in BED12 format. The BAM CIGAR" << endl; - cerr << "\t\tstring will reflect BED \"blocks\"." << endl << endl; + cerr << "\t-bed12\t" << "The BED file is in BED12 format. The BAM CIGAR" << endl; + cerr << "\t\tstring will reflect BED \"blocks\"." << endl << endl; - cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; - cerr << "Notes: " << endl; - cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; + cerr << "Notes: " << endl; + cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; - // end the program here - exit(1); + // end the program here + exit(1); } void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam) { - BamWriter *writer = new BamWriter(); - - // build a BAM header from the genomeFile - RefVector refs; - string bamHeader; - map<string, int, std::less<string> > chromToId; - MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId); - - // open a BAM and add the reference headers to the BAM file - writer->Open("stdout", bamHeader, refs, uncompressedBam); - - - // process each BED entry and convert to BAM - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; - // open the BED file for reading. - bed->Open(); - while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - BamAlignment bamEntry; - if (bed->bedType >= 4) { - ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum); - writer->SaveAlignment(bamEntry); - } - else { - cerr << "Error: BED entry without name found at line: " << lineNum << ". Exiting!" << endl; - exit (1); - } - bedEntry = nullBed; - } - } - // close up - bed->Close(); - writer->Close(); + BamWriter *writer = new BamWriter(); + + // build a BAM header from the genomeFile + RefVector refs; + string bamHeader; + map<string, int, std::less<string> > chromToId; + MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId); + + // open a BAM and add the reference headers to the BAM file + writer->Open("stdout", bamHeader, refs, uncompressedBam); + + + // process each BED entry and convert to BAM + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + // open the BED file for reading. + bed->Open(); + while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BamAlignment bamEntry; + if (bed->bedType >= 4) { + ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum); + writer->SaveAlignment(bamEntry); + } + else { + cerr << "Error: BED entry without name found at line: " << lineNum << ". Exiting!" << endl; + exit (1); + } + bedEntry = nullBed; + } + } + // close up + bed->Close(); + writer->Close(); } -void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId, +void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId, bool isBED12, int mapQual, int lineNum) { - - bam.Name = bed.name; - bam.Position = bed.start; - bam.Bin = reg2bin(bed.start, bed.end); - - // hard-code the sequence and qualities. - int bedLength = bed.end - bed.start; - - // set dummy seq and qual strings. the input is BED, - // so the sequence is inherently the same as it's - // reference genome. - // Thanks to James M. Ward for pointing this out. - bam.QueryBases = ""; - bam.Qualities = ""; - - // chrom and map quality - bam.RefID = chromToId[bed.chrom]; - bam.MapQuality = mapQual; - - // set the BAM FLAG - bam.AlignmentFlag = 0; - if (bed.strand == "-") - bam.SetIsReverseStrand(true); - - bam.MatePosition = -1; - bam.InsertSize = 0; - bam.MateRefID = -1; - - bam.CigarData.clear(); - - if (isBED12 == false) { - CigarOp cOp; - cOp.Type = 'M'; - cOp.Length = bedLength; - bam.CigarData.push_back(cOp); - } - // we're being told that the input is BED12. - else{ - - // does it smell like BED12? if so, process it. - if (bed.otherFields.size() == 6) { - - // extract the relevant BED fields to convert BED12 to BAM - // namely: blockCount, blockStarts, blockEnds - unsigned int blockCount = atoi(bed.otherFields[3].c_str()); - - vector<int> blockSizes, blockStarts; - Tokenize(bed.otherFields[4], blockSizes, ","); - Tokenize(bed.otherFields[5], blockStarts, ","); - - // make sure this is a well-formed BED12 entry. - if (blockSizes.size() != blockCount) { - cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ". Exiting!" << endl; - exit (1); - } - else { - // does the first block start after the bed.start? - // if so, we need to do some "splicing" - if (blockStarts[0] > 0) { - CigarOp cOp; - cOp.Length = blockStarts[0]; - cOp.Type = 'N'; - bam.CigarData.push_back(cOp); - } - // handle the "middle" blocks - for (unsigned int i = 0; i < blockCount - 1; ++i) { - CigarOp cOp; - cOp.Length = blockSizes[i]; - cOp.Type = 'M'; - bam.CigarData.push_back(cOp); - - if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) { - CigarOp cOp; - cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i])); - cOp.Type = 'N'; - bam.CigarData.push_back(cOp); - } - } - // handle the last block. - CigarOp cOp; - cOp.Length = blockSizes[blockCount - 1]; - cOp.Type = 'M'; - bam.CigarData.push_back(cOp); - } - } - // it doesn't smell like BED12. complain. - else { - cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found. Exiting." << endl << endl; - exit(1); - } - } + + bam.Name = bed.name; + bam.Position = bed.start; + bam.Bin = reg2bin(bed.start, bed.end); + + // hard-code the sequence and qualities. + int bedLength = bed.end - bed.start; + + // set dummy seq and qual strings. the input is BED, + // so the sequence is inherently the same as it's + // reference genome. + // Thanks to James M. Ward for pointing this out. + bam.QueryBases = ""; + bam.Qualities = ""; + + // chrom and map quality + bam.RefID = chromToId[bed.chrom]; + bam.MapQuality = mapQual; + + // set the BAM FLAG + bam.AlignmentFlag = 0; + if (bed.strand == "-") + bam.SetIsReverseStrand(true); + + bam.MatePosition = -1; + bam.InsertSize = 0; + bam.MateRefID = -1; + + bam.CigarData.clear(); + + if (isBED12 == false) { + CigarOp cOp; + cOp.Type = 'M'; + cOp.Length = bedLength; + bam.CigarData.push_back(cOp); + } + // we're being told that the input is BED12. + else{ + + // does it smell like BED12? if so, process it. + if (bed.otherFields.size() == 6) { + + // extract the relevant BED fields to convert BED12 to BAM + // namely: blockCount, blockStarts, blockEnds + unsigned int blockCount = atoi(bed.otherFields[3].c_str()); + + vector<int> blockSizes, blockStarts; + Tokenize(bed.otherFields[4], blockSizes, ","); + Tokenize(bed.otherFields[5], blockStarts, ","); + + // make sure this is a well-formed BED12 entry. + if (blockSizes.size() != blockCount) { + cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ". Exiting!" << endl; + exit (1); + } + else { + // does the first block start after the bed.start? + // if so, we need to do some "splicing" + if (blockStarts[0] > 0) { + CigarOp cOp; + cOp.Length = blockStarts[0]; + cOp.Type = 'N'; + bam.CigarData.push_back(cOp); + } + // handle the "middle" blocks + for (unsigned int i = 0; i < blockCount - 1; ++i) { + CigarOp cOp; + cOp.Length = blockSizes[i]; + cOp.Type = 'M'; + bam.CigarData.push_back(cOp); + + if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) { + CigarOp cOp; + cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i])); + cOp.Type = 'N'; + bam.CigarData.push_back(cOp); + } + } + // handle the last block. + CigarOp cOp; + cOp.Length = blockSizes[blockCount - 1]; + cOp.Type = 'M'; + bam.CigarData.push_back(cOp); + } + } + // it doesn't smell like BED12. complain. + else { + cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found. Exiting." << endl << endl; + exit(1); + } + } } -void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, +void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int, std::less<string> > &chromToId) { - - // make a genome map of the genome file. - GenomeFile genome(genomeFile); - - header += "@HD\tVN:1.0\tSO:unsorted\n"; - header += "@PG\tID:BEDTools_bedToBam\tVN:V"; - header += VERSION; - header += "\n"; - - int chromId = 0; - vector<string> chromList = genome.getChromList(); - sort(chromList.begin(), chromList.end()); - - // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file. - vector<string>::const_iterator genomeItr = chromList.begin(); - vector<string>::const_iterator genomeEnd = chromList.end(); - for (; genomeItr != genomeEnd; ++genomeItr) { - chromToId[*genomeItr] = chromId; - chromId++; - - // add to the header text - int size = genome.getChromSize(*genomeItr); - string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n"; - header += chromLine; - - // create a chrom entry and add it to - // the RefVector - RefData chrom; - chrom.RefName = *genomeItr; - chrom.RefLength = size; - chrom.RefHasAlignments = false; - refs.push_back(chrom); - } + + // make a genome map of the genome file. + GenomeFile genome(genomeFile); + + header += "@HD\tVN:1.0\tSO:unsorted\n"; + header += "@PG\tID:BEDTools_bedToBam\tVN:V"; + header += VERSION; + header += "\n"; + + int chromId = 0; + vector<string> chromList = genome.getChromList(); + sort(chromList.begin(), chromList.end()); + + // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file. + vector<string>::const_iterator genomeItr = chromList.begin(); + vector<string>::const_iterator genomeEnd = chromList.end(); + for (; genomeItr != genomeEnd; ++genomeItr) { + chromToId[*genomeItr] = chromId; + chromId++; + + // add to the header text + int size = genome.getChromSize(*genomeItr); + string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n"; + header += chromLine; + + // create a chrom entry and add it to + // the RefVector + RefData chrom; + chrom.RefName = *genomeItr; + chrom.RefLength = size; + chrom.RefHasAlignments = false; + refs.push_back(chrom); + } } /* Taken directly from the SAMTools spec calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */ int reg2bin(int beg, int end) { - --end; - if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); - if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); - if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); - if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); - if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); - return 0; -} + --end; + if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); + if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); + if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); + if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); + if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); + return 0; +} diff --git a/src/bedToIgv/bedToIgv.cpp b/src/bedToIgv/bedToIgv.cpp index 66ce31dd8cf9987fe47db7a971b182201e6bdba6..d98e80f96f2d632a63606ae0d98462b54db13a49 100644 --- a/src/bedToIgv/bedToIgv.cpp +++ b/src/bedToIgv/bedToIgv.cpp @@ -30,7 +30,7 @@ using namespace std; // function declarations void ShowHelp(void); -void DetermineBedInput(BedFile *bed, string path, string sortType, string session, +void DetermineBedInput(BedFile *bed, string path, string sortType, string session, bool collapse, bool useNames, string imageType, int slop); void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, string session, bool collapse, bool useNames, string imageType, int slop); @@ -38,181 +38,181 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; + // our configuration variables + bool showHelp = false; - // input files - string bedFile = "stdin"; + // input files + string bedFile = "stdin"; string imagePath = "./"; string sortType = "none"; - string session = "none"; + string session = "none"; int slop = 0; string imageType = "png"; - - bool haveBed = true; - bool collapse = false; + + bool haveBed = true; + bool collapse = false; bool useNames = false; - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-path", 5, parameterLength)) { - if ((i+1) < argc) { - imagePath = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-sort", 5, parameterLength)) { - if ((i+1) < argc) { - sortType = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-sess", 5, parameterLength)) { - if ((i+1) < argc) { - session = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-clps", 5, parameterLength)) { + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-path", 5, parameterLength)) { + if ((i+1) < argc) { + imagePath = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-sort", 5, parameterLength)) { + if ((i+1) < argc) { + sortType = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-sess", 5, parameterLength)) { + if ((i+1) < argc) { + session = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-clps", 5, parameterLength)) { collapse = true; - } - else if(PARAMETER_CHECK("-name", 5, parameterLength)) { + } + else if(PARAMETER_CHECK("-name", 5, parameterLength)) { useNames = true; - } - else if(PARAMETER_CHECK("-slop", 5, parameterLength)) { - if ((i+1) < argc) { - slop = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-img", 4, parameterLength)) { - if ((i+1) < argc) { - imageType = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (!haveBed ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; - showHelp = true; - } - if (sortType != "none") { - if ((sortType != "base") && (sortType != "position") && (sortType != "strand") && - (sortType != "quality") && (sortType != "sample") && (sortType != "readGroup")) { - cerr << endl << "*****" << endl << "*****ERROR: Invalid sort option. " << endl << "*****" << endl; - showHelp = true; - } - } + } + else if(PARAMETER_CHECK("-slop", 5, parameterLength)) { + if ((i+1) < argc) { + slop = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-img", 4, parameterLength)) { + if ((i+1) < argc) { + imageType = argv[i + 1]; + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (!haveBed ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; + showHelp = true; + } + if (sortType != "none") { + if ((sortType != "base") && (sortType != "position") && (sortType != "strand") && + (sortType != "quality") && (sortType != "sample") && (sortType != "readGroup")) { + cerr << endl << "*****" << endl << "*****ERROR: Invalid sort option. " << endl << "*****" << endl; + showHelp = true; + } + } if (slop < 0) { - cerr << endl << "*****" << endl << "*****ERROR: Slop must be >= 0. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedFile *bed = new BedFile(bedFile); - DetermineBedInput(bed, imagePath, sortType, session, collapse, useNames, imageType, slop); - } - else { - ShowHelp(); - } + cerr << endl << "*****" << endl << "*****ERROR: Slop must be >= 0. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedFile *bed = new BedFile(bedFile); + DetermineBedInput(bed, imagePath, sortType, session, collapse, useNames, imageType, slop); + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "Summary: Creates a batch script to create IGV images " << endl; - cerr << " at each interval defined in a BED/GFF/VCF file." << endl << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; + cerr << "Summary: Creates a batch script to create IGV images " << endl; + cerr << " at each interval defined in a BED/GFF/VCF file." << endl << endl; - cerr << "Options: " << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; - cerr << "\t-path\t" << "The full path to which the IGV snapshots should be written." << endl; - cerr << "\t\t(STRING) Default: ./" << endl << endl; + cerr << "Options: " << endl; - cerr << "\t-sess\t" << "The full path to an existing IGV session file to be " << endl; - cerr << "\t\tloaded prior to taking snapshots." << endl << endl; - cerr << "\t\t(STRING) Default is for no session to be loaded." << endl << endl; - - cerr << "\t-sort\t" << "The type of BAM sorting you would like to apply to each image. " << endl; - cerr << "\t\tOptions: base, position, strand, quality, sample, and readGroup" << endl; - cerr << "\t\tDefault is to apply no sorting at all." << endl << endl; + cerr << "\t-path\t" << "The full path to which the IGV snapshots should be written." << endl; + cerr << "\t\t(STRING) Default: ./" << endl << endl; - cerr << "\t-clps\t" << "Collapse the aligned reads prior to taking a snapshot. " << endl; - cerr << "\t\tDefault is to no collapse." << endl << endl; + cerr << "\t-sess\t" << "The full path to an existing IGV session file to be " << endl; + cerr << "\t\tloaded prior to taking snapshots." << endl << endl; + cerr << "\t\t(STRING) Default is for no session to be loaded." << endl << endl; - cerr << "\t-name\t" << "Use the \"name\" field (column 4) for each image's filename. " << endl; - cerr << "\t\tDefault is to use the \"chr:start-pos.ext\"." << endl << endl; - - cerr << "\t-slop\t" << "Number of flanking base pairs on the left & right of the image." << endl; - cerr << "\t\t- (INT) Default = 0." << endl << endl; + cerr << "\t-sort\t" << "The type of BAM sorting you would like to apply to each image. " << endl; + cerr << "\t\tOptions: base, position, strand, quality, sample, and readGroup" << endl; + cerr << "\t\tDefault is to apply no sorting at all." << endl << endl; - cerr << "\t-img\t" << "The type of image to be created. " << endl; - cerr << "\t\tOptions: png, eps, svg" << endl; - cerr << "\t\tDefault is png." << endl << endl; + cerr << "\t-clps\t" << "Collapse the aligned reads prior to taking a snapshot. " << endl; + cerr << "\t\tDefault is to no collapse." << endl << endl; - cerr << "Notes: " << endl; - cerr << "\t(1) The resulting script is meant to be run from within the IGV GUI version 1.5 or later." << endl; + cerr << "\t-name\t" << "Use the \"name\" field (column 4) for each image's filename. " << endl; + cerr << "\t\tDefault is to use the \"chr:start-pos.ext\"." << endl << endl; + + cerr << "\t-slop\t" << "Number of flanking base pairs on the left & right of the image." << endl; + cerr << "\t\t- (INT) Default = 0." << endl << endl; + + cerr << "\t-img\t" << "The type of image to be created. " << endl; + cerr << "\t\tOptions: png, eps, svg" << endl; + cerr << "\t\tDefault is png." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) The resulting script is meant to be run from within the IGV GUI version 1.5 or later." << endl; cerr << "\t(2) Unless you use the -sess option, it is assumed that prior to running the script, " << endl; cerr << "\t\tyou have loaded the proper genome, tracks and data files." << endl << endl; - // end the program here - exit(1); + // end the program here + exit(1); } -void DetermineBedInput(BedFile *bed, string path, string sortType, string session, +void DetermineBedInput(BedFile *bed, string path, string sortType, string session, bool collapse, bool useNames, string imageType, int slop) { - - // dealing with a proper file - if (bed->bedFile != "stdin") { - - ifstream bedStream(bed->bedFile.c_str(), ios::in); - if ( !bedStream ) { - cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - ProcessBed(bedStream, bed, path, sortType, session, collapse, useNames, imageType, slop); - } - // reading from stdin - else { - ProcessBed(cin, bed, path, sortType, session, collapse, useNames, imageType, slop); - } + + // dealing with a proper file + if (bed->bedFile != "stdin") { + + ifstream bedStream(bed->bedFile.c_str(), ios::in); + if ( !bedStream ) { + cerr << "Error: The requested bed file (" << bed->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + ProcessBed(bedStream, bed, path, sortType, session, collapse, useNames, imageType, slop); + } + // reading from stdin + else { + ProcessBed(cin, bed, path, sortType, session, collapse, useNames, imageType, slop); + } } -void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, string session, +void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, string session, bool collapse, bool useNames, string imageType, int slop) { // set the image path @@ -221,27 +221,27 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s // should we load a session if (session != "none") cout << "load " << session << endl; - - - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; - - bed->Open(); - // process each BED entry and convert to an IGV request - while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - + + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + bed->Open(); + // process each BED entry and convert to an IGV request + while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + string filename = bedEntry.chrom + ":" + ToString(bedEntry.start) + "-" + ToString(bedEntry.end); string locus = bedEntry.chrom + ":" + ToString(bedEntry.start - slop) + "-" + ToString(bedEntry.end + slop); - + if (useNames == true) { if (bedEntry.name.empty() == false) filename = filename + "_" + bedEntry.name; else { cerr << "Error: You requested that filenames be based upon the name field. However, it appears to be empty. Exiting!" << endl; - exit (1); - } + exit (1); + } } if (slop > 0) { filename = filename + "_" + "slop" + ToString(slop); @@ -249,21 +249,21 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s // goto cout << "goto " << locus << endl; - // sort + // sort if (sortType != "none") cout << "sort " << sortType << endl; - + // collapse if (collapse == true) cout << "collapse" << endl; - + // snapshot cout << "snapshot " << filename << "." << imageType << endl; - + // reset - bedEntry = nullBed; - } - } - // close up - bed->Close(); + bedEntry = nullBed; + } + } + // close up + bed->Close(); } diff --git a/src/closestBed/closestBed.cpp b/src/closestBed/closestBed.cpp index 68d770fc17227bb3b037b6b817a07833fe5971cb..6eda4f952029b7c3d8fe733f6c70e30a4d1376c7 100644 --- a/src/closestBed/closestBed.cpp +++ b/src/closestBed/closestBed.cpp @@ -13,213 +13,213 @@ #include "closestBed.h" const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases. - // We don't want to keep looking if we - // can't find a nearby feature within 512 Mb. + // We don't want to keep looking if we + // can't find a nearby feature within 512 Mb. const int SLOPGROWTH = 2048000; /* - Constructor + Constructor */ BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool forceStrand, string &tieMode, bool reportDistance) { - _bedAFile = bedAFile; - _bedBFile = bedBFile; - _forceStrand = forceStrand; - _tieMode = tieMode; + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _forceStrand = forceStrand; + _tieMode = tieMode; _reportDistance = reportDistance; - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - - FindClosestBed(); + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + FindClosestBed(); } /* - Destructor + Destructor */ BedClosest::~BedClosest(void) { } void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { - - int slop = 0; // start out just looking for overlaps - // within the current bin (~128Kb) - // update the current feature's start and end + int slop = 0; // start out just looking for overlaps + // within the current bin (~128Kb) + + // update the current feature's start and end - CHRPOS aFudgeStart = 0; - CHRPOS aFudgeEnd; - int numOverlaps = 0; - vector<BED> closestB; - float maxOverlap = 0; - CHRPOS minDistance = INT_MAX; - vector<CHRPOS> distances; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; + int numOverlaps = 0; + vector<BED> closestB; + float maxOverlap = 0; + CHRPOS minDistance = INT_MAX; + vector<CHRPOS> distances; // is there at least one feature in B on the same chrom // as the current A feature? - if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { - - while ((numOverlaps == 0) && (slop <= MAXSLOP)) { - - // add some slop (starting at 0 bases) to a in hopes - // of finding a hit in B - if ((static_cast<int>(a.start) - slop) > 0) - aFudgeStart = a.start - slop; - else - aFudgeStart = 0; - - if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP)) - aFudgeEnd = a.end + slop; - else - aFudgeEnd = 2 * MAXSLOP; - - // THE HEAVY LIFTING - // search for hits with the current slop added - _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _forceStrand); - - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - - numOverlaps++; - - // do the actual features overlap? - int s = max(a.start, h->start); - int e = min(a.end, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end - a.start); // the length of a in b.p. - - // there is overlap - if (s < e) { - // is there enough overlap (default ~ 1bp) - float overlap = (float) overlapBases / (float) aLength; - if ( overlap > 0 ) { - // is this hit the closest? - if (overlap > maxOverlap) { - maxOverlap = overlap; - - closestB.clear(); - closestB.push_back(*h); - distances.clear(); + if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { + + while ((numOverlaps == 0) && (slop <= MAXSLOP)) { + + // add some slop (starting at 0 bases) to a in hopes + // of finding a hit in B + if ((static_cast<int>(a.start) - slop) > 0) + aFudgeStart = a.start - slop; + else + aFudgeStart = 0; + + if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP)) + aFudgeEnd = a.end + slop; + else + aFudgeEnd = 2 * MAXSLOP; + + // THE HEAVY LIFTING + // search for hits with the current slop added + _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _forceStrand); + + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + + numOverlaps++; + + // do the actual features overlap? + int s = max(a.start, h->start); + int e = min(a.end, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int aLength = (a.end - a.start); // the length of a in b.p. + + // there is overlap + if (s < e) { + // is there enough overlap (default ~ 1bp) + float overlap = (float) overlapBases / (float) aLength; + if ( overlap > 0 ) { + // is this hit the closest? + if (overlap > maxOverlap) { + maxOverlap = overlap; + + closestB.clear(); + closestB.push_back(*h); + distances.clear(); distances.push_back(0); - } - else if (overlap == maxOverlap) { - closestB.push_back(*h); - distances.push_back(0); - } - } - } - // the hit is to the "left" of A - else if (h->end < a.start){ - if ((a.start - h->end) < minDistance) { - minDistance = a.start - h->end; - - closestB.clear(); - closestB.push_back(*h); - distances.clear(); + } + else if (overlap == maxOverlap) { + closestB.push_back(*h); + distances.push_back(0); + } + } + } + // the hit is to the "left" of A + else if (h->end < a.start){ + if ((a.start - h->end) < minDistance) { + minDistance = a.start - h->end; + + closestB.clear(); + closestB.push_back(*h); + distances.clear(); + distances.push_back(minDistance); + } + else if ((a.start - h->end) == minDistance) { + closestB.push_back(*h); + distances.push_back(minDistance); + } + } + // the hit is to the "right" of A + else { + if ((h->start - a.end) < minDistance) { + minDistance = h->start - a.end; + + closestB.clear(); + closestB.push_back(*h); + distances.clear(); distances.push_back(minDistance); - } - else if ((a.start - h->end) == minDistance) { - closestB.push_back(*h); - distances.push_back(minDistance); - } - } - // the hit is to the "right" of A - else { - if ((h->start - a.end) < minDistance) { - minDistance = h->start - a.end; - - closestB.clear(); - closestB.push_back(*h); - distances.clear(); + } + else if ((h->start - a.end) == minDistance) { + closestB.push_back(*h); distances.push_back(minDistance); - } - else if ((h->start - a.end) == minDistance) { - closestB.push_back(*h); - distances.push_back(minDistance); - } - } - } - // if no overlaps were found, we'll widen the range - // by SLOPGROWTH in each direction and search again. - slop += SLOPGROWTH; - } - } - // there is no feature in B on the same chromosome as A - else { - _bedA->reportBedTab(a); - if (_reportDistance == true) { - _bedB->reportNullBedTab(); - cout << -1 << endl; + } + } + } + // if no overlaps were found, we'll widen the range + // by SLOPGROWTH in each direction and search again. + slop += SLOPGROWTH; + } + } + // there is no feature in B on the same chromosome as A + else { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportNullBedTab(); + cout << -1 << endl; } else - _bedB->reportNullBedNewLine(); - } + _bedB->reportNullBedNewLine(); + } // report the closest feature(s) in B to the current A feature. // obey the user's reporting request (_tieMode) - if (numOverlaps > 0) { - - if (closestB.size() == 1 || _tieMode == "first") { - _bedA->reportBedTab(a); - if (_reportDistance == true) { - _bedB->reportBedTab(closestB[0]); - cout << distances[0] << endl; + if (numOverlaps > 0) { + + if (closestB.size() == 1 || _tieMode == "first") { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportBedTab(closestB[0]); + cout << distances[0] << endl; } else - _bedB->reportBedNewLine(closestB[0]); - } - else { - if (_tieMode == "all") { + _bedB->reportBedNewLine(closestB[0]); + } + else { + if (_tieMode == "all") { size_t i = 0; - for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) { - _bedA->reportBedTab(a); - if (_reportDistance == true) { - _bedB->reportBedTab(*b); - cout << distances[i++] <<endl; + for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportBedTab(*b); + cout << distances[i++] <<endl; } else - _bedB->reportBedNewLine(*b); - } - } - else if (_tieMode == "last") { - _bedA->reportBedTab(a); - if (_reportDistance == true) { - _bedB->reportBedTab(closestB[closestB.size()-1]); - cout << distances[distances.size() - 1]<<endl; + _bedB->reportBedNewLine(*b); + } + } + else if (_tieMode == "last") { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportBedTab(closestB[closestB.size()-1]); + cout << distances[distances.size() - 1]<<endl; } else _bedB->reportBedNewLine(closestB[closestB.size()-1]); - } - } - } + } + } + } } - + void BedClosest::FindClosestBed() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - BED a, nullBed; - int lineNum = 0; // current input line number - vector<BED> hits; // vector of potential hits - hits.reserve(100); - BedLineStatus bedStatus; - - _bedA->Open(); - // process each entry in A in search of the closest feature in B - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - FindWindowOverlaps(a, hits); - hits.clear(); - a = nullBed; - } - } - _bedA->Close(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + BED a, nullBed; + int lineNum = 0; // current input line number + vector<BED> hits; // vector of potential hits + hits.reserve(100); + BedLineStatus bedStatus; + + _bedA->Open(); + // process each entry in A in search of the closest feature in B + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + FindWindowOverlaps(a, hits); + hits.clear(); + a = nullBed; + } + } + _bedA->Close(); } // END ClosestBed diff --git a/src/closestBed/closestBed.h b/src/closestBed/closestBed.h index c1f4182ae381dd06ec2dad6cda4cb81c6074895d..06d77c74f12cd630b93e3eaeeae12c604ba8ca8a 100644 --- a/src/closestBed/closestBed.h +++ b/src/closestBed/closestBed.h @@ -26,29 +26,29 @@ class BedClosest { public: - // constructor - BedClosest(string &bedAFile, string &bedBFile, bool forceStrand, string &tieMode, bool reportDistance); - - // destructor - ~BedClosest(void); - - // find the closest feature in B to A - void FindClosestBed(); - + // constructor + BedClosest(string &bedAFile, string &bedBFile, bool forceStrand, string &tieMode, bool reportDistance); + + // destructor + ~BedClosest(void); + + // find the closest feature in B to A + void FindClosestBed(); + private: - - // data - string _bedAFile; - string _bedBFile; - string _tieMode; - bool _forceStrand; - bool _reportDistance; - - BedFile *_bedA, *_bedB; - - // methods - void reportNullB(); - void FindWindowOverlaps(BED &, vector<BED> &); + + // data + string _bedAFile; + string _bedBFile; + string _tieMode; + bool _forceStrand; + bool _reportDistance; + + BedFile *_bedA, *_bedB; + + // methods + void reportNullB(); + void FindWindowOverlaps(BED &, vector<BED> &); }; #endif /* CLOSEST_H */ diff --git a/src/closestBed/closestMain.cpp b/src/closestBed/closestMain.cpp index 9c397eabd022259262a86728d5133399f4da4851..bb8c6ce546c7a69aae364f492090a4e2799f241d 100644 --- a/src/closestBed/closestMain.cpp +++ b/src/closestBed/closestMain.cpp @@ -25,126 +25,126 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - string tieMode = "all"; - - bool haveBedA = false; - bool haveBedB = false; - bool haveTieMode = false; - bool forceStrand = false; + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + string tieMode = "all"; + + bool haveBedA = false; + bool haveBedB = false; + bool haveTieMode = false; + bool forceStrand = false; bool reportDistance = false; - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if( (PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-d", 2, parameterLength)) { - reportDistance = true; - } - else if (PARAMETER_CHECK("-t", 2, parameterLength)) { - if ((i+1) < argc) { - haveTieMode = true; - tieMode = argv[i + 1]; - i++; - } - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (haveTieMode && (tieMode != "all") && (tieMode != "first") - && (tieMode != "last")) { - cerr << endl << "*****" << endl << "*****ERROR: Request \"all\" or \"first\" or \"last\" for Tie Mode (-t)" << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedClosest *bc = new BedClosest(bedAFile, bedBFile, forceStrand, tieMode, reportDistance); - delete bc; - return 0; - } - else { - ShowHelp(); - } + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if( (PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if (PARAMETER_CHECK("-d", 2, parameterLength)) { + reportDistance = true; + } + else if (PARAMETER_CHECK("-t", 2, parameterLength)) { + if ((i+1) < argc) { + haveTieMode = true; + tieMode = argv[i + 1]; + i++; + } + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (haveTieMode && (tieMode != "all") && (tieMode != "first") + && (tieMode != "last")) { + cerr << endl << "*****" << endl << "*****ERROR: Request \"all\" or \"first\" or \"last\" for Tie Mode (-t)" << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedClosest *bc = new BedClosest(bedAFile, bedBFile, forceStrand, tieMode, reportDistance); + delete bc; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "\t Erik Arner, Riken" << endl << endl; - - cerr << "Summary: For each feature in A, finds the closest " << endl; - cerr << "\t feature (upstream or downstream) in B." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t-s\t" << "Force strandedness. That is, find the closest feature in B" << endl; - cerr << "\t\tthat overlaps A on the same strand." << endl; - cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; - - cerr << "\t-d\t" << "In addition to the closest feature in B, " << endl; - cerr << "\t\treport its distance to A as an extra column." << endl; - cerr << "\t\t- The reported distance for overlapping features will be 0." << endl << endl; - - - cerr << "\t-t\t" << "How ties for closest feature are handled. This occurs when two" << endl; - cerr << "\t\tfeatures in B have exactly the same overlap with A." << endl; - cerr << "\t\tBy default, all such features in B are reported." << endl; - cerr << "\t\tHere are all the options:" << endl; - cerr << "\t\t- \"all\" Report all ties (default)." << endl; - cerr << "\t\t- \"first\" Report the first tie that occurred in the B file." << endl; - cerr << "\t\t- \"last\" Report the last tie that occurred in the B file." << endl << endl; - - cerr << "Notes: " << endl; - cerr << "\tReports \"none\" for chrom and \"-1\" for all other fields when a feature" << endl; - cerr << "\tis not found in B on the same chromosome as the feature in A." << endl; - cerr << "\tE.g. none\t-1\t-1" << endl << endl; - - // end the program here - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << "\t Erik Arner, Riken" << endl << endl; + + cerr << "Summary: For each feature in A, finds the closest " << endl; + cerr << "\t feature (upstream or downstream) in B." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-s\t" << "Force strandedness. That is, find the closest feature in B" << endl; + cerr << "\t\tthat overlaps A on the same strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-d\t" << "In addition to the closest feature in B, " << endl; + cerr << "\t\treport its distance to A as an extra column." << endl; + cerr << "\t\t- The reported distance for overlapping features will be 0." << endl << endl; + + + cerr << "\t-t\t" << "How ties for closest feature are handled. This occurs when two" << endl; + cerr << "\t\tfeatures in B have exactly the same overlap with A." << endl; + cerr << "\t\tBy default, all such features in B are reported." << endl; + cerr << "\t\tHere are all the options:" << endl; + cerr << "\t\t- \"all\" Report all ties (default)." << endl; + cerr << "\t\t- \"first\" Report the first tie that occurred in the B file." << endl; + cerr << "\t\t- \"last\" Report the last tie that occurred in the B file." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\tReports \"none\" for chrom and \"-1\" for all other fields when a feature" << endl; + cerr << "\tis not found in B on the same chromosome as the feature in A." << endl; + cerr << "\tE.g. none\t-1\t-1" << endl << endl; + + // end the program here + exit(1); } diff --git a/src/complementBed/complementBed.cpp b/src/complementBed/complementBed.cpp index 33b48e3ca065dff263dd17b24fadff3691eb9dd1..f14d1d8d362790eb29e1ee823d377d4f8ea3f240 100644 --- a/src/complementBed/complementBed.cpp +++ b/src/complementBed/complementBed.cpp @@ -14,12 +14,12 @@ BedComplement::BedComplement(string &bedFile, string &genomeFile) { - _bedFile = bedFile; - _genomeFile = genomeFile; - - _bed = new BedFile(bedFile); - _genome = new GenomeFile(genomeFile); - + _bedFile = bedFile; + _genomeFile = genomeFile; + + _bed = new BedFile(bedFile); + _genome = new GenomeFile(genomeFile); + } @@ -28,61 +28,61 @@ BedComplement::~BedComplement(void) { // -// Merge overlapping BED entries into a single entry +// Merge overlapping BED entries into a single entry // void BedComplement::ComplementBed() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - vector<short> chromMasks; - string currChrom; - - // loop through each chromosome and merge their BED entries - masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); - masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + vector<short> chromMasks; + string currChrom; + + // loop through each chromosome and merge their BED entries + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { - currChrom = m->first; - CHRPOS currChromSize = _genome->getChromSize(currChrom); - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - - // create a flag for every base on the chrom. - vector<short> chromMasks(currChromSize, 0); - - vector<BED>::const_iterator bIt = bedList.begin(); - vector<BED>::const_iterator bEnd = bedList.end(); - for ( ; bIt != bEnd; ++bIt) { - - // sanity check the end of the bed entry - if (bIt->end > currChromSize) { - cout << "End of BED entry exceeds chromosome length. Please correct." << endl; - _bed->reportBedNewLine(*bIt); - exit(1); - } - - // mask all of the positions spanned by this BED entry. - for (CHRPOS b = bIt->start; b < bIt->end; b++) - chromMasks[b] = 1; - } - - CHRPOS i = 0; - CHRPOS start; - while (i < chromMasks.size()) { - if (chromMasks[i] == 0) { - start = i; - while ((chromMasks[i] == 0) && (i < chromMasks.size())) - i++; - - if (start > 0) - cout << currChrom << "\t" << start << "\t" << i << endl; - else - cout << currChrom << "\t" << 0 << "\t" << i << endl; - } - i++; - } - } + currChrom = m->first; + CHRPOS currChromSize = _genome->getChromSize(currChrom); + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // create a flag for every base on the chrom. + vector<short> chromMasks(currChromSize, 0); + + vector<BED>::const_iterator bIt = bedList.begin(); + vector<BED>::const_iterator bEnd = bedList.end(); + for ( ; bIt != bEnd; ++bIt) { + + // sanity check the end of the bed entry + if (bIt->end > currChromSize) { + cout << "End of BED entry exceeds chromosome length. Please correct." << endl; + _bed->reportBedNewLine(*bIt); + exit(1); + } + + // mask all of the positions spanned by this BED entry. + for (CHRPOS b = bIt->start; b < bIt->end; b++) + chromMasks[b] = 1; + } + + CHRPOS i = 0; + CHRPOS start; + while (i < chromMasks.size()) { + if (chromMasks[i] == 0) { + start = i; + while ((chromMasks[i] == 0) && (i < chromMasks.size())) + i++; + + if (start > 0) + cout << currChrom << "\t" << start << "\t" << i << endl; + else + cout << currChrom << "\t" << 0 << "\t" << i << endl; + } + i++; + } + } } diff --git a/src/complementBed/complementBed.h b/src/complementBed/complementBed.h index bdd958e0611a2fb5d697ee1efd03b5b1d9e09139..a04cc964ec3f2bd6a59178fddc68aae232951cf7 100644 --- a/src/complementBed/complementBed.h +++ b/src/complementBed/complementBed.h @@ -29,7 +29,7 @@ class BedComplement { public: - // constructor + // constructor BedComplement(string &bedFile, string &genomeFile); // destructor @@ -38,9 +38,9 @@ public: void ComplementBed(); private: - - string _bedFile; - string _genomeFile; - BedFile *_bed; - GenomeFile *_genome; + + string _bedFile; + string _genomeFile; + BedFile *_bed; + GenomeFile *_genome; }; diff --git a/src/complementBed/complementMain.cpp b/src/complementBed/complementMain.cpp index dc3116c0b49f0d75964b4069466f0cc2359fa290..4087ce0dfea9cc83ff4efff2ade22318c30e44ec 100644 --- a/src/complementBed/complementMain.cpp +++ b/src/complementBed/complementMain.cpp @@ -26,91 +26,91 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - string genomeFile; - - bool haveBed = true; - bool haveGenome = false; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) < argc) { - haveGenome = true; - genomeFile = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed || !haveGenome) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file and -g Genome file. " << endl << "*****" << endl; - showHelp = true; - } - if (!showHelp) { - BedComplement *bc = new BedComplement(bedFile, genomeFile); - bc->ComplementBed(); - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + bool haveBed = true; + bool haveGenome = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file and -g Genome file. " << endl << "*****" << endl; + showHelp = true; + } + if (!showHelp) { + BedComplement *bc = new BedComplement(bedFile, genomeFile); + bc->ComplementBed(); + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Returns the base pair complement of a feature file." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; - - cerr << "Notes: " << endl; - cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; - cerr << "\t <chromName><TAB><chromSize>" << endl << endl; - cerr << "\tFor example, Human (hg19):" << endl; - cerr << "\tchr1\t249250621" << endl; - cerr << "\tchr2\t243199373" << endl; - cerr << "\t..." << endl; - cerr << "\tchr18_gl000207_random\t4262" << endl << endl; - - cerr << "Tips: " << endl; - cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; - cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; - cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; - cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; - - exit(1); - + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Returns the base pair complement of a feature file." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\t <chromName><TAB><chromSize>" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + cerr << "Tips: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + exit(1); + } diff --git a/src/coverageBed/coverageBed.cpp b/src/coverageBed/coverageBed.cpp index fd69d9b45a94d83c868d5e021c9d06b3c0d9e1ae..df1a224591ced7654d0f499a2475342664d386b7 100644 --- a/src/coverageBed/coverageBed.cpp +++ b/src/coverageBed/coverageBed.cpp @@ -13,222 +13,222 @@ #include "coverageBed.h" // build -BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool forceStrand, +BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool forceStrand, bool writeHistogram, bool bamInput, bool obeySplits, bool eachBase) { - - _bedAFile = bedAFile; - _bedBFile = bedBFile; - - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - - _forceStrand = forceStrand; + + _bedAFile = bedAFile; + _bedBFile = bedBFile; + + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + _forceStrand = forceStrand; _obeySplits = obeySplits; _eachBase = eachBase; - _writeHistogram = writeHistogram; - _bamInput = bamInput; - - - if (_bamInput == false) - CollectCoverageBed(); - else - CollectCoverageBam(_bedA->bedFile); + _writeHistogram = writeHistogram; + _bamInput = bamInput; + + + if (_bamInput == false) + CollectCoverageBed(); + else + CollectCoverageBam(_bedA->bedFile); } // destroy BedCoverage::~BedCoverage(void) { - delete _bedA; - delete _bedB; + delete _bedA; + delete _bedB; } void BedCoverage::CollectCoverageBed() { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedCovFileIntoMap(); - - int lineNum = 0; // current input line number - BED a, nullBed; - BedLineStatus bedStatus; - - _bedA->Open(); - // process each entry in A - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - // process the BED entry as a single block + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedCovFileIntoMap(); + + int lineNum = 0; // current input line number + BED a, nullBed; + BedLineStatus bedStatus; + + _bedA->Open(); + // process each entry in A + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + // process the BED entry as a single block if (_obeySplits == false) - _bedB->countHits(a, _forceStrand); - // split the BED into discrete blocksand process each independently. - else { - bedVector bedBlocks; + _bedB->countHits(a, _forceStrand); + // split the BED into discrete blocksand process each independently. + else { + bedVector bedBlocks; splitBedIntoBlocks(a, lineNum, bedBlocks); - - // use countSplitHits to avoid over-counting each split chunk - // as distinct read coverage. - _bedB->countSplitHits(bedBlocks, _forceStrand); - } - a = nullBed; - } - } - _bedA->Close(); - - // report the coverage (summary or histogram) for BED B. - ReportCoverage(); + + // use countSplitHits to avoid over-counting each split chunk + // as distinct read coverage. + _bedB->countSplitHits(bedBlocks, _forceStrand); + } + a = nullBed; + } + } + _bedA->Close(); + + // report the coverage (summary or histogram) for BED B. + ReportCoverage(); } void BedCoverage::CollectCoverageBam(string bamFile) { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedCovFileIntoMap(); - - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // convert each aligned BAM entry to BED - // and compute coverage on B - BamAlignment bam; - while (reader.GetNextAlignment(bam)) { - if (bam.IsMapped()) { - // treat the BAM alignment as a single "block" - if (_obeySplits == false) { - // construct a new BED entry from the current BAM alignment. - BED a; - a.chrom = refs.at(bam.RefID).RefName; - a.start = bam.Position; - a.end = bam.GetEndPosition(false); - a.strand = "+"; - if (bam.IsReverseStrand()) a.strand = "-"; - - _bedB->countHits(a, _forceStrand); - } - // split the BAM alignment into discrete blocks and - // look for overlaps only within each block. - else { - // vec to store the discrete BED "blocks" from a - bedVector bedBlocks; - // since we are counting coverage, we do want to split blocks when a + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedCovFileIntoMap(); + + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // convert each aligned BAM entry to BED + // and compute coverage on B + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + if (bam.IsMapped()) { + // treat the BAM alignment as a single "block" + if (_obeySplits == false) { + // construct a new BED entry from the current BAM alignment. + BED a; + a.chrom = refs.at(bam.RefID).RefName; + a.start = bam.Position; + a.end = bam.GetEndPosition(false); + a.strand = "+"; + if (bam.IsReverseStrand()) a.strand = "-"; + + _bedB->countHits(a, _forceStrand); + } + // split the BAM alignment into discrete blocks and + // look for overlaps only within each block. + else { + // vec to store the discrete BED "blocks" from a + bedVector bedBlocks; + // since we are counting coverage, we do want to split blocks when a // deletion (D) CIGAR op is encountered (hence the true for the last parm) - getBamBlocks(bam, refs, bedBlocks, true); - // use countSplitHits to avoid over-counting each split chunk - // as distinct read coverage. - _bedB->countSplitHits(bedBlocks, _forceStrand); - } - } - } - // report the coverage (summary or histogram) for BED B. - ReportCoverage(); - // close the BAM file - reader.Close(); + getBamBlocks(bam, refs, bedBlocks, true); + // use countSplitHits to avoid over-counting each split chunk + // as distinct read coverage. + _bedB->countSplitHits(bedBlocks, _forceStrand); + } + } + } + // report the coverage (summary or histogram) for BED B. + ReportCoverage(); + // close the BAM file + reader.Close(); } void BedCoverage::ReportCoverage() { - map<unsigned int, unsigned int> allDepthHist; - unsigned int totalLength = 0; - - // process each chromosome - masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin(); - masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end(); - for (; chromItr != chromEnd; ++chromItr) - { - // for each chrom, process each bin - binsToBedCovs::const_iterator binItr = chromItr->second.begin(); - binsToBedCovs::const_iterator binEnd = chromItr->second.end(); - for (; binItr != binEnd; ++binItr) - { - // for each chrom & bin, compute and report + map<unsigned int, unsigned int> allDepthHist; + unsigned int totalLength = 0; + + // process each chromosome + masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin(); + masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end(); + for (; chromItr != chromEnd; ++chromItr) + { + // for each chrom, process each bin + binsToBedCovs::const_iterator binItr = chromItr->second.begin(); + binsToBedCovs::const_iterator binEnd = chromItr->second.end(); + for (; binItr != binEnd; ++binItr) + { + // for each chrom & bin, compute and report // the observed coverage for each feature - vector<BEDCOV>::const_iterator bedItr = binItr->second.begin(); - vector<BEDCOV>::const_iterator bedEnd = binItr->second.end(); - for (; bedItr != bedEnd; ++bedItr) - { - int zeroDepthCount = 0; // number of bases with zero depth - int depth = 0; // tracks the depth at the current base - - // the start is either the first base in the feature OR - // the leftmost position of an overlapping feature. e.g. (s = start): - // A ---------- - // B s ------------ - int start = min(bedItr->minOverlapStart, bedItr->start); - - // track the numnber of bases in the feature covered by - // 0, 1, 2, ... n features in A - map<unsigned int, unsigned int> depthHist; - map<unsigned int, DEPTH>::const_iterator depthItr; - - // compute the coverage observed at each base in the feature marching from start to end. - for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) - { - // map pointer grabbing the starts and ends observed at this position - depthItr = bedItr->depthMap.find(pos); - // increment coverage if starts observed at this position. - if (depthItr != bedItr->depthMap.end()) - depth += depthItr->second.starts; - // update coverage assuming the current position is within the current B feature - if ((pos > bedItr->start) && (pos <= bedItr->end)) { - if (depth == 0) zeroDepthCount++; - // update our histograms, assuming we are not reporting "per-base" coverage. - if (_eachBase == false) { - depthHist[depth]++; - allDepthHist[depth]++; - } - else { - _bedB->reportBedTab(*bedItr); - printf("%d\t%d\n", pos-bedItr->start, depth); - } - } - // decrement coverage if ends observed at this position. - if (depthItr != bedItr->depthMap.end()) - depth = depth - depthItr->second.ends; - } - - // Summarize the coverage for the current interval, - // assuming the user has not requested "per-base" coverage. + vector<BEDCOV>::const_iterator bedItr = binItr->second.begin(); + vector<BEDCOV>::const_iterator bedEnd = binItr->second.end(); + for (; bedItr != bedEnd; ++bedItr) + { + int zeroDepthCount = 0; // number of bases with zero depth + int depth = 0; // tracks the depth at the current base + + // the start is either the first base in the feature OR + // the leftmost position of an overlapping feature. e.g. (s = start): + // A ---------- + // B s ------------ + int start = min(bedItr->minOverlapStart, bedItr->start); + + // track the numnber of bases in the feature covered by + // 0, 1, 2, ... n features in A + map<unsigned int, unsigned int> depthHist; + map<unsigned int, DEPTH>::const_iterator depthItr; + + // compute the coverage observed at each base in the feature marching from start to end. + for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) + { + // map pointer grabbing the starts and ends observed at this position + depthItr = bedItr->depthMap.find(pos); + // increment coverage if starts observed at this position. + if (depthItr != bedItr->depthMap.end()) + depth += depthItr->second.starts; + // update coverage assuming the current position is within the current B feature + if ((pos > bedItr->start) && (pos <= bedItr->end)) { + if (depth == 0) zeroDepthCount++; + // update our histograms, assuming we are not reporting "per-base" coverage. + if (_eachBase == false) { + depthHist[depth]++; + allDepthHist[depth]++; + } + else { + _bedB->reportBedTab(*bedItr); + printf("%d\t%d\n", pos-bedItr->start, depth); + } + } + // decrement coverage if ends observed at this position. + if (depthItr != bedItr->depthMap.end()) + depth = depth - depthItr->second.ends; + } + + // Summarize the coverage for the current interval, + // assuming the user has not requested "per-base" coverage. if (_eachBase == false) { - CHRPOS length = bedItr->end - bedItr->start; - totalLength += length; - int nonZeroBases = (length - zeroDepthCount); - float fractCovered = (float) nonZeroBases / length; - - // print a summary of the coverage - if (_writeHistogram == false) { - _bedB->reportBedTab(*bedItr); - printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered); - } - // report the number of bases with coverage == x - else { - map<unsigned int, unsigned int>::const_iterator histItr = depthHist.begin(); - map<unsigned int, unsigned int>::const_iterator histEnd = depthHist.end(); - for (; histItr != histEnd; ++histItr) - { - float fractAtThisDepth = (float) histItr->second / length; - _bedB->reportBedTab(*bedItr); - printf("%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, length, fractAtThisDepth); - } - } - } - } - } - } - // report a histogram of coverage among _all_ - // features in B. - if (_writeHistogram == true) { - map<unsigned int, unsigned int>::const_iterator histItr = allDepthHist.begin(); - map<unsigned int, unsigned int>::const_iterator histEnd = allDepthHist.end(); - for (; histItr != histEnd; ++histItr) { - float fractAtThisDepth = (float) histItr->second / totalLength; - printf("all\t%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, totalLength, fractAtThisDepth); - } - } + CHRPOS length = bedItr->end - bedItr->start; + totalLength += length; + int nonZeroBases = (length - zeroDepthCount); + float fractCovered = (float) nonZeroBases / length; + + // print a summary of the coverage + if (_writeHistogram == false) { + _bedB->reportBedTab(*bedItr); + printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered); + } + // report the number of bases with coverage == x + else { + map<unsigned int, unsigned int>::const_iterator histItr = depthHist.begin(); + map<unsigned int, unsigned int>::const_iterator histEnd = depthHist.end(); + for (; histItr != histEnd; ++histItr) + { + float fractAtThisDepth = (float) histItr->second / length; + _bedB->reportBedTab(*bedItr); + printf("%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, length, fractAtThisDepth); + } + } + } + } + } + } + // report a histogram of coverage among _all_ + // features in B. + if (_writeHistogram == true) { + map<unsigned int, unsigned int>::const_iterator histItr = allDepthHist.begin(); + map<unsigned int, unsigned int>::const_iterator histEnd = allDepthHist.end(); + for (; histItr != histEnd; ++histItr) { + float fractAtThisDepth = (float) histItr->second / totalLength; + printf("all\t%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, totalLength, fractAtThisDepth); + } + } } diff --git a/src/coverageBed/coverageBed.h b/src/coverageBed/coverageBed.h index 3f192e392b410052bf1eaaad608947bcff66b83d..e5b33e0bb903f95c9994b9996935a1dc55fee4b5 100644 --- a/src/coverageBed/coverageBed.h +++ b/src/coverageBed/coverageBed.h @@ -9,7 +9,7 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ -#ifndef COVERAGEBED_H +#ifndef COVERAGEBED_H #define COVERAGEBED_H #include "bedFile.h" @@ -35,42 +35,42 @@ class BedCoverage { public: - // constructor - BedCoverage(string &bedAFile, string &bedBFile, bool forceStrand, bool writeHistogram, - bool bamInput, bool obeySplits, bool eachBase); + // constructor + BedCoverage(string &bedAFile, string &bedBFile, bool forceStrand, bool writeHistogram, + bool bamInput, bool obeySplits, bool eachBase); + + // destructor + ~BedCoverage(void); - // destructor - ~BedCoverage(void); - private: - // input files. - string _bedAFile; - string _bedBFile; - - // instance of a bed file class. - BedFile *_bedA, *_bedB; - - // do we care about strandedness when counting coverage? - bool _forceStrand; - - // should we write a histogram for each feature in B? - bool _writeHistogram; - - // are we dealing with BAM input for "A"? - bool _bamInput; - - // should we split BED/BAM into discrete blocks? + // input files. + string _bedAFile; + string _bedBFile; + + // instance of a bed file class. + BedFile *_bedA, *_bedB; + + // do we care about strandedness when counting coverage? + bool _forceStrand; + + // should we write a histogram for each feature in B? + bool _writeHistogram; + + // are we dealing with BAM input for "A"? + bool _bamInput; + + // should we split BED/BAM into discrete blocks? bool _obeySplits; - + // should discrete coverage be reported for each base in each feature? bool _eachBase; - - // private function for reporting coverage information - void ReportCoverage(); - - void CollectCoverageBed(); - void CollectCoverageBam(string bamFile); + // private function for reporting coverage information + void ReportCoverage(); + + void CollectCoverageBed(); + + void CollectCoverageBam(string bamFile); }; #endif /* COVERAGEBED_H */ diff --git a/src/coverageBed/coverageMain.cpp b/src/coverageBed/coverageMain.cpp index c04d257c6bab91fd56084e70aa02ff6a4426123b..7b687af5cdcc59dd91e4f114003af3d164d3fa67 100644 --- a/src/coverageBed/coverageMain.cpp +++ b/src/coverageBed/coverageMain.cpp @@ -25,138 +25,138 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // parm flags - bool forceStrand = false; - bool writeHistogram = false; - bool eachBase = false; + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // parm flags + bool forceStrand = false; + bool writeHistogram = false; + bool eachBase = false; bool obeySplits = false; - bool bamInput = false; - bool haveBedA = false; - bool haveBedB = false; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bamInput = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-hist", 5, parameterLength)) { - writeHistogram = true; - } - else if(PARAMETER_CHECK("-d", 2, parameterLength)) { - eachBase = true; - } - else if (PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, forceStrand, writeHistogram, bamInput, obeySplits, eachBase); - delete bg; - return 0; - } - else { - ShowHelp(); - } + bool bamInput = false; + bool haveBedA = false; + bool haveBedB = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bamInput = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if (PARAMETER_CHECK("-hist", 5, parameterLength)) { + writeHistogram = true; + } + else if(PARAMETER_CHECK("-d", 2, parameterLength)) { + eachBase = true; + } + else if (PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, forceStrand, writeHistogram, bamInput, obeySplits, eachBase); + delete bg; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Returns the depth and breadth of coverage of features from A" << endl; - cerr << "\t on the intervals in B." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-abam\t" << "The A input file is in BAM format." << endl << endl; - - cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; - cerr << "\t\toverlap B on the same strand." << endl; - cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; - - cerr << "\t-hist\t" << "Report a histogram of coverage for each feature in B" << endl; - cerr << "\t\tas well as a summary histogram for _all_ features in B." << endl << endl; - cerr << "\t\tOutput (tab delimited) after each feature in B:" << endl; - cerr << "\t\t 1) depth\n\t\t 2) # bases at depth\n\t\t 3) size of B\n\t\t 4) % of B at depth" << endl << endl; - - cerr << "\t-d\t" << "Report the depth at each position in each B feature." << endl; - cerr << "\t\tPositions reported are one based. Each position" << endl; - cerr << "\t\tand depth follow the complete B feature." << endl << endl; - - cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl; - cerr << "\t\twhen computing coverage." << endl; - cerr << "\t\tFor BAM files, this uses the CIGAR \"N\" and \"D\" operations " << endl; - cerr << "\t\tto infer the blocks for computing coverage." << endl; - cerr << "\t\tFor BED12 files, this uses the BlockCount, BlockStarts," << endl; - cerr << "\t\tand BlockEnds fields (i.e., columns 10,11,12)." << endl << endl; - - cerr << "Default Output: " << endl; - cerr << "\t" << " After each entry in B, reports: " << endl; - cerr << "\t 1) The number of features in A that overlapped the B interval." << endl; - cerr << "\t 2) The number of bases in B that had non-zero coverage." << endl; - cerr << "\t 3) The length of the entry in B." << endl; - cerr << "\t 4) The fraction of bases in B that had non-zero coverage." << endl << endl; - - exit(1); + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Returns the depth and breadth of coverage of features from A" << endl; + cerr << "\t on the intervals in B." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only include hits in A that" << endl; + cerr << "\t\toverlap B on the same strand." << endl; + cerr << "\t\t- By default, hits are included without respect to strand." << endl << endl; + + cerr << "\t-hist\t" << "Report a histogram of coverage for each feature in B" << endl; + cerr << "\t\tas well as a summary histogram for _all_ features in B." << endl << endl; + cerr << "\t\tOutput (tab delimited) after each feature in B:" << endl; + cerr << "\t\t 1) depth\n\t\t 2) # bases at depth\n\t\t 3) size of B\n\t\t 4) % of B at depth" << endl << endl; + + cerr << "\t-d\t" << "Report the depth at each position in each B feature." << endl; + cerr << "\t\tPositions reported are one based. Each position" << endl; + cerr << "\t\tand depth follow the complete B feature." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl; + cerr << "\t\twhen computing coverage." << endl; + cerr << "\t\tFor BAM files, this uses the CIGAR \"N\" and \"D\" operations " << endl; + cerr << "\t\tto infer the blocks for computing coverage." << endl; + cerr << "\t\tFor BED12 files, this uses the BlockCount, BlockStarts," << endl; + cerr << "\t\tand BlockEnds fields (i.e., columns 10,11,12)." << endl << endl; + + cerr << "Default Output: " << endl; + cerr << "\t" << " After each entry in B, reports: " << endl; + cerr << "\t 1) The number of features in A that overlapped the B interval." << endl; + cerr << "\t 2) The number of bases in B that had non-zero coverage." << endl; + cerr << "\t 3) The length of the entry in B." << endl; + cerr << "\t 4) The fraction of bases in B that had non-zero coverage." << endl << endl; + + exit(1); } diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index 9b70f9cf29837390a8d7386bd25e3dec14c1420b..bc4a5a6d2abf5f6ff9fecd3ec62240ca08c94747 100644 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -13,40 +13,40 @@ #include "fastaFromBed.h" -Bed2Fa::Bed2Fa(bool &useName, string &dbFile, string &bedFile, +Bed2Fa::Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile, bool &useFasta, bool &useStrand) { - if (useName) { - _useName = true; - } - - _dbFile = dbFile; - _bedFile = bedFile; - _fastaOutFile = fastaOutFile; - _useFasta = useFasta; - _useStrand = useStrand; - - _bed = new BedFile(_bedFile); - - // Figure out what the output file should be. - if (fastaOutFile == "stdout") { + if (useName) { + _useName = true; + } + + _dbFile = dbFile; + _bedFile = bedFile; + _fastaOutFile = fastaOutFile; + _useFasta = useFasta; + _useStrand = useStrand; + + _bed = new BedFile(_bedFile); + + // Figure out what the output file should be. + if (fastaOutFile == "stdout") { _faOut = &cout; } else { - // Make sure we can open the file. - ofstream fa(fastaOutFile.c_str(), ios::out); - if ( !fa ) { - cerr << "Error: The requested fasta output file (" << fastaOutFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - fa.close(); - _faOut = new ofstream(fastaOutFile.c_str(), ios::out); - } - } - - // Extract the requested intervals from the FASTA input file. - ExtractDNA(); + // Make sure we can open the file. + ofstream fa(fastaOutFile.c_str(), ios::out); + if ( !fa ) { + cerr << "Error: The requested fasta output file (" << fastaOutFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + fa.close(); + _faOut = new ofstream(fastaOutFile.c_str(), ios::out); + } + } + + // Extract the requested intervals from the FASTA input file. + ExtractDNA(); } @@ -58,37 +58,37 @@ Bed2Fa::~Bed2Fa(void) { // ReportDNA //****************************************************************************** void Bed2Fa::ReportDNA(const BED &bed, const string &currDNA, const string &currChrom) { - - if ( (bed.start <= currDNA.size()) && (bed.end <= currDNA.size()) ) { - - string dna = currDNA.substr(bed.start, ((bed.end - bed.start))); - // revcomp if necessary. Thanks to Thomas Doktor. - if ((_useStrand == true) && (bed.strand == "-")) - reverseComplement(dna); - - if (!(_useName)) { - if (_useFasta == true) { - if (_useStrand == true) - *_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << endl << dna << endl; - else - *_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << endl << dna << endl; - } - else { - if (_useStrand == true) - *_faOut << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl; - else - *_faOut << currChrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl; - } - } - else { - if (_useFasta == true) - *_faOut << ">" << bed.name << endl << dna << endl; - else - *_faOut << bed.name << "\t" << dna << endl; - } + + if ( (bed.start <= currDNA.size()) && (bed.end <= currDNA.size()) ) { + + string dna = currDNA.substr(bed.start, ((bed.end - bed.start))); + // revcomp if necessary. Thanks to Thomas Doktor. + if ((_useStrand == true) && (bed.strand == "-")) + reverseComplement(dna); + + if (!(_useName)) { + if (_useFasta == true) { + if (_useStrand == true) + *_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << endl << dna << endl; + else + *_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << endl << dna << endl; + } + else { + if (_useStrand == true) + *_faOut << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl; + else + *_faOut << currChrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl; + } + } + else { + if (_useFasta == true) + *_faOut << ">" << bed.name << endl << dna << endl; + else + *_faOut << bed.name << "\t" << dna << endl; + } } - else cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond " - << currChrom << " size (" << currDNA.size() << " bp). Skipping." << endl; + else cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond " + << currChrom << " size (" << currDNA.size() << " bp). Skipping." << endl; } @@ -98,53 +98,53 @@ void Bed2Fa::ReportDNA(const BED &bed, const string &currDNA, const string &curr //****************************************************************************** void Bed2Fa::ExtractDNA() { - /* Make sure that we can oen all of the files successfully*/ - - // open the fasta database for reading - ifstream faDb(_dbFile.c_str(), ios::in); - if ( !faDb ) { - cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - - // load the BED file into an unbinned map. - _bed->loadBedFileIntoMapNoBin(); - - //Read the fastaDb chromosome by chromosome - string fastaDbLine; - string currChrom; - string currDNA = ""; - currDNA.reserve(500000000); - - while (getline(faDb,fastaDbLine)) { - if (fastaDbLine.find(">",0) != 0 ) { - currDNA += fastaDbLine; - } - else { - if (currDNA.size() > 0) { - - vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin(); - vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end(); - // loop through each BED entry for this chrom and print the sequence + /* Make sure that we can oen all of the files successfully*/ + + // open the fasta database for reading + ifstream faDb(_dbFile.c_str(), ios::in); + if ( !faDb ) { + cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + // load the BED file into an unbinned map. + _bed->loadBedFileIntoMapNoBin(); + + //Read the fastaDb chromosome by chromosome + string fastaDbLine; + string currChrom; + string currDNA = ""; + currDNA.reserve(500000000); + + while (getline(faDb,fastaDbLine)) { + if (fastaDbLine.find(">",0) != 0 ) { + currDNA += fastaDbLine; + } + else { + if (currDNA.size() > 0) { + + vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin(); + vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end(); + // loop through each BED entry for this chrom and print the sequence for (; bedItr != bedEnd; ++bedItr) { - ReportDNA(*bedItr, currDNA, currChrom); + ReportDNA(*bedItr, currDNA, currChrom); } - currDNA = ""; - } - currChrom = fastaDbLine.substr(1, fastaDbLine.find_first_of(" ")-1); - } - } - - // process the last chromosome in the fasta file. - if (currDNA.size() > 0) { - vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin(); - vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end(); - // loop through each BED entry for this chrom and print the sequence + currDNA = ""; + } + currChrom = fastaDbLine.substr(1, fastaDbLine.find_first_of(" ")-1); + } + } + + // process the last chromosome in the fasta file. + if (currDNA.size() > 0) { + vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin(); + vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end(); + // loop through each BED entry for this chrom and print the sequence for (; bedItr != bedEnd; ++bedItr) { ReportDNA(*bedItr, currDNA, currChrom); } - currDNA = ""; - } + currDNA = ""; + } } diff --git a/src/fastaFromBed/fastaFromBed.h b/src/fastaFromBed/fastaFromBed.h index e8d4ab0a4355e3a82a5f772a61769dcce11f84a5..383d1416b497940595a1049c24ae8d423859b511 100644 --- a/src/fastaFromBed/fastaFromBed.h +++ b/src/fastaFromBed/fastaFromBed.h @@ -26,29 +26,29 @@ using namespace std; class Bed2Fa { public: - - // constructor - Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile, - bool &useFasta, bool &useStrand); - // destructor - ~Bed2Fa(void); + // constructor + Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile, + bool &useFasta, bool &useStrand); - void ExtractDNA(); + // destructor + ~Bed2Fa(void); + + void ExtractDNA(); void ReportDNA(const BED &bed, const string &currDNA, const string &currChrom); - + private: - - bool _useName; - string _dbFile; - string _bedFile; - string _fastaOutFile; - bool _useFasta; - bool _useStrand; - - // instance of a bed file class. - BedFile *_bed; + + bool _useName; + string _dbFile; + string _bedFile; + string _fastaOutFile; + bool _useFasta; + bool _useStrand; + + // instance of a bed file class. + BedFile *_bed; ostream *_faOut; }; diff --git a/src/fastaFromBed/fastaFromBedMain.cpp b/src/fastaFromBed/fastaFromBedMain.cpp index 6342c9f20b395b6490afe94a1f741049971327bf..c31676d93228329dad44141c43b5d790aa8471d9 100644 --- a/src/fastaFromBed/fastaFromBedMain.cpp +++ b/src/fastaFromBed/fastaFromBedMain.cpp @@ -26,121 +26,121 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string fastaDbFile; - string bedFile; - - // output files - string fastaOutFile; - - // checks for existence of parameters - bool haveFastaDb = false; - bool haveBed = false; - bool haveFastaOut = false; - bool useNameOnly = false; - bool useFasta = true; - bool useStrand = false; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-fi", 3, parameterLength)) { - if ((i+1) < argc) { - haveFastaDb = true; - fastaDbFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-fo", 3, parameterLength)) { - if ((i+1) < argc) { - haveFastaOut = true; - fastaOutFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { - if ((i+1) < argc) { - haveBed = true; - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-name", 5, parameterLength)) { - useNameOnly = true; - } - else if(PARAMETER_CHECK("-tab", 4, parameterLength)) { - useFasta = false; - } - else if(PARAMETER_CHECK("-s", 2, parameterLength)) { - useStrand = true; - } - else { - cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - if (!haveFastaDb || !haveFastaOut || !haveBed) { - showHelp = true; - } - - if (!showHelp) { - - Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, bedFile, fastaOutFile, useFasta, useStrand); - delete b2f; - - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string fastaDbFile; + string bedFile; + + // output files + string fastaOutFile; + + // checks for existence of parameters + bool haveFastaDb = false; + bool haveBed = false; + bool haveFastaOut = false; + bool useNameOnly = false; + bool useFasta = true; + bool useStrand = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-fi", 3, parameterLength)) { + if ((i+1) < argc) { + haveFastaDb = true; + fastaDbFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-fo", 3, parameterLength)) { + if ((i+1) < argc) { + haveFastaOut = true; + fastaOutFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-name", 5, parameterLength)) { + useNameOnly = true; + } + else if(PARAMETER_CHECK("-tab", 4, parameterLength)) { + useFasta = false; + } + else if(PARAMETER_CHECK("-s", 2, parameterLength)) { + useStrand = true; + } + else { + cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + if (!haveFastaDb || !haveFastaOut || !haveBed) { + showHelp = true; + } + + if (!showHelp) { + + Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, bedFile, fastaOutFile, useFasta, useStrand); + delete b2f; + + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Extract DNA sequences into a fasta file based on feature coordinates." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -bed <bed/gff/vcf> -fo <fasta> " << endl << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "Options: " << endl; - cerr << "\t-fi\tInput FASTA file" << endl; - cerr << "\t-bed\tBED/GFF/VCF file of ranges to extract from -fi" << endl; - cerr << "\t-fo\tOutput file (can be FASTA or TAB-delimited)" << endl; - cerr << "\t-name\tUse the name field for the FASTA header" << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "\t-tab\tWrite output in TAB delimited format." << endl; - cerr << "\t\t- Default is FASTA format." << endl << endl; + cerr << "Summary: Extract DNA sequences into a fasta file based on feature coordinates." << endl << endl; - cerr << "\t-s\tForce strandedness. If the feature occupies the antisense strand," << endl; - cerr << "\t\tthe sequence will be reverse complemented." << endl; - cerr << "\t\t- By default, strand information is ignored." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -bed <bed/gff/vcf> -fo <fasta> " << endl << endl; + cerr << "Options: " << endl; + cerr << "\t-fi\tInput FASTA file" << endl; + cerr << "\t-bed\tBED/GFF/VCF file of ranges to extract from -fi" << endl; + cerr << "\t-fo\tOutput file (can be FASTA or TAB-delimited)" << endl; + cerr << "\t-name\tUse the name field for the FASTA header" << endl; + cerr << "\t-tab\tWrite output in TAB delimited format." << endl; + cerr << "\t\t- Default is FASTA format." << endl << endl; + + cerr << "\t-s\tForce strandedness. If the feature occupies the antisense strand," << endl; + cerr << "\t\tthe sequence will be reverse complemented." << endl; + cerr << "\t\t- By default, strand information is ignored." << endl << endl; + + + + // end the program here + exit(1); - // end the program here - exit(1); - } diff --git a/src/fjoin/fjoin.cpp b/src/fjoin/fjoin.cpp index c1cbfa0c16357ddfbd8490af6d5ede994d12aca4..8159118366b698c3e2c7e5fd4533ed26a29b6b1a 100644 --- a/src/fjoin/fjoin.cpp +++ b/src/fjoin/fjoin.cpp @@ -20,144 +20,144 @@ bool leftOf(const BED &a, const BED &b); bool BedIntersect::processHits(BED &a, vector<BED> &hits) { // how many overlaps are there b/w the bed and the set of hits? int s, e, overlapBases; - int numOverlaps = 0; + int numOverlaps = 0; bool hitsFound = false; - int aLength = (a.end - a.start); // the length of a in b.p. - - // loop through the hits and report those that meet the user's criteria - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - s = max(a.start, h->start); - e = min(a.end, h->end); - overlapBases = (e - s); // the number of overlapping bases b/w a and b - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - // Report the hit if the user doesn't care about reciprocal overlap between A and B. - if (_reciprocal == false) { - hitsFound = true; - numOverlaps++; - if (_printable == true) - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - // we require there to be sufficient __reciprocal__ overlap - else { - int bLength = (h->end - h->start); - float bOverlap = ( (float) overlapBases / (float) bLength ); - if (bOverlap >= _overlapFraction) { - hitsFound = true; - numOverlaps++; - if (_printable == true) - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - } - } - } - // report the summary of the overlaps if requested. - ReportOverlapSummary(a, numOverlaps); - // were hits found for this BED feature? - return hitsFound; + int aLength = (a.end - a.start); // the length of a in b.p. + + // loop through the hits and report those that meet the user's criteria + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + s = max(a.start, h->start); + e = min(a.end, h->end); + overlapBases = (e - s); // the number of overlapping bases b/w a and b + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { + // Report the hit if the user doesn't care about reciprocal overlap between A and B. + if (_reciprocal == false) { + hitsFound = true; + numOverlaps++; + if (_printable == true) + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + // we require there to be sufficient __reciprocal__ overlap + else { + int bLength = (h->end - h->start); + float bOverlap = ( (float) overlapBases / (float) bLength ); + if (bOverlap >= _overlapFraction) { + hitsFound = true; + numOverlaps++; + if (_printable == true) + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + } + } + } + // report the summary of the overlaps if requested. + ReportOverlapSummary(a, numOverlaps); + // were hits found for this BED feature? + return hitsFound; } /* - Constructor + Constructor */ -BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { - - _bedAFile = bedAFile; - _bedBFile = bedBFile; - _anyHit = anyHit; - _noHit = noHit; - _writeA = writeA; - _writeB = writeB; - _writeOverlap = writeOverlap; - _writeAllOverlap = writeAllOverlap; - _writeCount = writeCount; - _overlapFraction = overlapFraction; - _forceStrand = forceStrand; - _reciprocal = reciprocal; +BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { + + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _anyHit = anyHit; + _noHit = noHit; + _writeA = writeA; + _writeB = writeB; + _writeOverlap = writeOverlap; + _writeAllOverlap = writeAllOverlap; + _writeCount = writeCount; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; + _reciprocal = reciprocal; _obeySplits = obeySplits; - _bamInput = bamInput; - _bamOutput = bamOutput; - - if (_anyHit || _noHit || _writeCount) - _printable = false; - else + _bamInput = bamInput; + _bamOutput = bamOutput; + + if (_anyHit || _noHit || _writeCount) + _printable = false; + else _printable = true; - - // create new BED file objects for A and B - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - - IntersectBed(); + + // create new BED file objects for A and B + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + IntersectBed(); } /* - Destructor + Destructor */ BedIntersect::~BedIntersect(void) { } - - + + bool leftOf(const BED &a, const BED &b) { return (a.end <= b.start); } void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, - const CHRPOS &s, const CHRPOS &e) { - // default. simple intersection only - if (_writeA == false && _writeB == false && _writeOverlap == false) { - _bedA->reportBedRangeNewLine(a,s,e); - } - // -wa -wbwrite the original A and B - else if (_writeA == true && _writeB == true) { - _bedA->reportBedTab(a); - _bedB->reportBedNewLine(b); - } - // -wa write just the original A - else if (_writeA == true) { - _bedA->reportBedNewLine(a); - } - // -wb write the intersected portion of A and the original B - else if (_writeB == true) { - _bedA->reportBedRangeTab(a,s,e); - _bedB->reportBedNewLine(b); - } - // -wo write the original A and B plus the no. of overlapping bases. - else if (_writeOverlap == true) { - _bedA->reportBedTab(a); - _bedB->reportBedTab(b); - printf("%d\n", overlapBases); - } + const CHRPOS &s, const CHRPOS &e) { + // default. simple intersection only + if (_writeA == false && _writeB == false && _writeOverlap == false) { + _bedA->reportBedRangeNewLine(a,s,e); + } + // -wa -wbwrite the original A and B + else if (_writeA == true && _writeB == true) { + _bedA->reportBedTab(a); + _bedB->reportBedNewLine(b); + } + // -wa write just the original A + else if (_writeA == true) { + _bedA->reportBedNewLine(a); + } + // -wb write the intersected portion of A and the original B + else if (_writeB == true) { + _bedA->reportBedRangeTab(a,s,e); + _bedB->reportBedNewLine(b); + } + // -wo write the original A and B plus the no. of overlapping bases. + else if (_writeOverlap == true) { + _bedA->reportBedTab(a); + _bedB->reportBedTab(b); + printf("%d\n", overlapBases); + } } void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { - // -u just report the fact that there was >= 1 overlaps - if (_anyHit && (numOverlapsFound >= 1)) { - _bedA->reportBedNewLine(a); - } - // -c report the total number of features overlapped in B - else if (_writeCount) { - _bedA->reportBedTab(a); - printf("%d\n", numOverlapsFound); - } - // -v report iff there were no overlaps - else if (_noHit && (numOverlapsFound == 0)) { - _bedA->reportBedNewLine(a); - } - // -wao the user wants to force the reporting of 0 overlap - else if (_writeAllOverlap && (numOverlapsFound == 0)) { - _bedA->reportBedTab(a); - _bedB->reportNullBedTab(); - printf("0\n"); - } + // -u just report the fact that there was >= 1 overlaps + if (_anyHit && (numOverlapsFound >= 1)) { + _bedA->reportBedNewLine(a); + } + // -c report the total number of features overlapped in B + else if (_writeCount) { + _bedA->reportBedTab(a); + printf("%d\n", numOverlapsFound); + } + // -v report iff there were no overlaps + else if (_noHit && (numOverlapsFound == 0)) { + _bedA->reportBedNewLine(a); + } + // -wao the user wants to force the reporting of 0 overlap + else if (_writeAllOverlap && (numOverlapsFound == 0)) { + _bedA->reportBedTab(a); + _bedB->reportNullBedTab(); + printf("0\n"); + } } @@ -169,29 +169,29 @@ void BedIntersect::Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus, return; } - std::vector<BED *>::iterator wYIter = windowY->begin(); + std::vector<BED *>::iterator wYIter = windowY->begin(); while (wYIter != windowY->end()) { - if (leftOf(*(*wYIter), *x) == true) { + if (leftOf(*(*wYIter), *x) == true) { (*wYIter)->finished = true; wYIter = windowY->erase(wYIter); // erase auto-increments to the next position - } + } else if (overlaps((*wYIter)->start, (*wYIter)->end, x->start, x->end) > 0) { - if (_lastPick == 0) { + if (_lastPick == 0) { AddHits(x, *(*wYIter)); - } + } else { AddHits(*wYIter, *x); } - ++wYIter; // force incrementing + ++wYIter; // force incrementing } } if (leftOf(*x,y) == false) windowX->push_back(x); else { x->finished = true; - } + } // dump the buffered results (if any) - FlushOutputBuffer(); + FlushOutputBuffer(); } @@ -203,14 +203,14 @@ void BedIntersect::AddHits(BED *x, const BED &y) { void BedIntersect::FlushOutputBuffer(bool final) { - while (_outputBuffer.empty() == false) - { + while (_outputBuffer.empty() == false) + { if (final == false && _outputBuffer.front()->finished == false) break; - - processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps); + + processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps); // remove the finished BED entry from the heap - delete _outputBuffer.front(); + delete _outputBuffer.front(); _outputBuffer.pop(); } } @@ -218,129 +218,129 @@ void BedIntersect::FlushOutputBuffer(bool final) { vector<BED*>* BedIntersect::GetWindow(const string &chrom, bool isA) { - // iterator to test if a window for a given chrom exists. - map<string, vector<BED*> >::iterator it; - - // grab the current window for A or B, depending on - // the request. if a window hasn't yet been created - // for the requested chrom, create one. - - if (isA) { - it = _windowA.find(chrom); - if (it != _windowA.end()) { - return & _windowA[chrom]; - } - else { - _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); - return & _windowA[chrom]; - } - } - else { - it = _windowB.find(chrom); - if (it != _windowB.end()) { - return & _windowB[chrom]; - } - else { - _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); - return & _windowB[chrom]; - } - } + // iterator to test if a window for a given chrom exists. + map<string, vector<BED*> >::iterator it; + + // grab the current window for A or B, depending on + // the request. if a window hasn't yet been created + // for the requested chrom, create one. + + if (isA) { + it = _windowA.find(chrom); + if (it != _windowA.end()) { + return & _windowA[chrom]; + } + else { + _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); + return & _windowA[chrom]; + } + } + else { + it = _windowB.find(chrom); + if (it != _windowB.end()) { + return & _windowB[chrom]; + } + else { + _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); + return & _windowB[chrom]; + } + } } void BedIntersect::ChromSwitch(const string &chrom) { - - vector<BED*>::iterator windowAIter = _windowA[chrom].begin(); - vector<BED*>::iterator windowAEnd = _windowA[chrom].end(); - for (; windowAIter != windowAEnd; ++windowAIter) - (*windowAIter)->finished = true; - - vector<BED*>::iterator windowBIter = _windowB[chrom].begin(); - vector<BED*>::iterator windowBEnd = _windowB[chrom].end(); - for (; windowBIter != windowBEnd; ++windowBIter) - (*windowBIter)->finished = true; - - FlushOutputBuffer(); + + vector<BED*>::iterator windowAIter = _windowA[chrom].begin(); + vector<BED*>::iterator windowAEnd = _windowA[chrom].end(); + for (; windowAIter != windowAEnd; ++windowAIter) + (*windowAIter)->finished = true; + + vector<BED*>::iterator windowBIter = _windowB[chrom].begin(); + vector<BED*>::iterator windowBEnd = _windowB[chrom].end(); + for (; windowBIter != windowBEnd; ++windowBIter) + (*windowBIter)->finished = true; + + FlushOutputBuffer(); } -void BedIntersect::IntersectBed() { - - int aLineNum = 0; - int bLineNum = 0; - +void BedIntersect::IntersectBed() { + + int aLineNum = 0; + int bLineNum = 0; + // current feature from each file - BED *a, *b, *prevA, *prevB; - - // status of the current lines - BedLineStatus aStatus, bStatus; - + BED *a, *b, *prevA, *prevB; + + // status of the current lines + BedLineStatus aStatus, bStatus; + // open the files; get the first line from each _bedA->Open(); _bedB->Open(); - - prevA = NULL; - prevB = NULL; + + prevA = NULL; + prevB = NULL; a = new BED(); b = new BED(); aStatus = _bedA->GetNextBed(*a, aLineNum); bStatus = _bedB->GetNextBed(*b, bLineNum); - + while (aStatus != BED_INVALID || bStatus != BED_INVALID) { - + if ((a->start <= b->start) && (a->chrom == b->chrom)) { - prevA = a; + prevA = a; _lastPick = 0; - Scan(a, GetWindow(a->chrom, true), aStatus, - *b, GetWindow(a->chrom, false), bStatus); + Scan(a, GetWindow(a->chrom, true), aStatus, + *b, GetWindow(a->chrom, false), bStatus); a = new BED(); aStatus = _bedA->GetNextBed(*a, aLineNum); } else if ((a->start > b->start) && (a->chrom == b->chrom)) { - prevB = b; + prevB = b; _lastPick = 1; - Scan(b, GetWindow(b->chrom, false), bStatus, - *a, GetWindow(b->chrom, true), aStatus); - - b = new BED(); + Scan(b, GetWindow(b->chrom, false), bStatus, + *a, GetWindow(b->chrom, true), aStatus); + + b = new BED(); bStatus = _bedB->GetNextBed(*b, bLineNum); } - else if (a->chrom != b->chrom) { - // A was most recently read - if (_lastPick == 0) { - prevB = b; - while (b->chrom == prevA->chrom){ - _windowB[prevA->chrom].push_back(b); - b = new BED(); - bStatus = _bedB->GetNextBed(*b, bLineNum); - } - Scan(prevA, GetWindow(prevA->chrom, true), aStatus, - *prevB, GetWindow(prevA->chrom, false), bStatus); - } - // B was most recently read - else { - prevA = a; - while (a->chrom == prevB->chrom) { - _windowA[prevB->chrom].push_back(a); - a = new BED(); - aStatus = _bedA->GetNextBed(*a, aLineNum); - } - Scan(prevB, GetWindow(prevB->chrom, false), bStatus, - *prevA, GetWindow(prevB->chrom, true), aStatus); - } - FlushOutputBuffer(true); - } - if (prevA!=NULL&&prevB!=NULL) - cout << prevA->chrom << " " << a->chrom << " " << a->start << " " - << prevB->chrom << " " << b->chrom << " " << b->start << "\n"; - if (aStatus == BED_INVALID) a->start = INT_MAX; - if (bStatus == BED_INVALID) b->start = INT_MAX; + else if (a->chrom != b->chrom) { + // A was most recently read + if (_lastPick == 0) { + prevB = b; + while (b->chrom == prevA->chrom){ + _windowB[prevA->chrom].push_back(b); + b = new BED(); + bStatus = _bedB->GetNextBed(*b, bLineNum); + } + Scan(prevA, GetWindow(prevA->chrom, true), aStatus, + *prevB, GetWindow(prevA->chrom, false), bStatus); + } + // B was most recently read + else { + prevA = a; + while (a->chrom == prevB->chrom) { + _windowA[prevB->chrom].push_back(a); + a = new BED(); + aStatus = _bedA->GetNextBed(*a, aLineNum); + } + Scan(prevB, GetWindow(prevB->chrom, false), bStatus, + *prevA, GetWindow(prevB->chrom, true), aStatus); + } + FlushOutputBuffer(true); + } + if (prevA!=NULL&&prevB!=NULL) + cout << prevA->chrom << " " << a->chrom << " " << a->start << " " + << prevB->chrom << " " << b->chrom << " " << b->start << "\n"; + if (aStatus == BED_INVALID) a->start = INT_MAX; + if (bStatus == BED_INVALID) b->start = INT_MAX; } // clear out the final bit of staged output FlushOutputBuffer(true); - + // close the files _bedA->Close(); _bedB->Close(); diff --git a/src/fjoin/fjoin.h b/src/fjoin/fjoin.h index 5cc847040bac1222499b3e1b4b9a5646c0c5e61d..dd0a111bed3e188013bbb85442e880c5ae1c629e 100644 --- a/src/fjoin/fjoin.h +++ b/src/fjoin/fjoin.h @@ -33,82 +33,82 @@ class BedIntersect { public: - // constructor - BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput); - - // destructor - ~BedIntersect(void); - + // constructor + BedIntersect(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput); + + // destructor + ~BedIntersect(void); + private: - - //------------------------------------------------ - // private attributes - //------------------------------------------------ - string _bedAFile; - string _bedBFile; - - bool _writeA; // should the original A feature be reported? - bool _writeB; // should the original B feature be reported? - bool _writeOverlap; - bool _writeAllOverlap; - - bool _forceStrand; - bool _reciprocal; - float _overlapFraction; - - bool _anyHit; - bool _noHit; - bool _writeCount; // do we want a count of the number of overlaps in B? + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + string _bedAFile; + string _bedBFile; + + bool _writeA; // should the original A feature be reported? + bool _writeB; // should the original B feature be reported? + bool _writeOverlap; + bool _writeAllOverlap; + + bool _forceStrand; + bool _reciprocal; + float _overlapFraction; + + bool _anyHit; + bool _noHit; + bool _writeCount; // do we want a count of the number of overlaps in B? bool _obeySplits; - bool _bamInput; - bool _bamOutput; - + bool _bamInput; + bool _bamOutput; + bool _printable; - + queue<BED*> _outputBuffer; bool _lastPick; - - map<string, vector<BED*> > _windowA; - map<string, vector<BED*> > _windowB; - - // instance of a bed file class. - BedFile *_bedA, *_bedB; - - //------------------------------------------------ - // private methods - //------------------------------------------------ - void IntersectBed(istream &bedInput); - - void Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus, + + map<string, vector<BED*> > _windowA; + map<string, vector<BED*> > _windowB; + + // instance of a bed file class. + BedFile *_bedA, *_bedB; + + //------------------------------------------------ + // private methods + //------------------------------------------------ + void IntersectBed(istream &bedInput); + + void Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus, const BED &y, vector<BED *> *windowY, BedLineStatus yStatus); void AddHits(BED *x, const BED &y); - + void FlushOutputBuffer(bool final = false); - - vector<BED*>* GetWindow(const string &chrom, bool isA); - - void ChromSwitch(const string &chrom); - - void IntersectBed(); - - void IntersectBam(string bamFile); - + + vector<BED*>* GetWindow(const string &chrom, bool isA); + + void ChromSwitch(const string &chrom); + + void IntersectBed(); + + void IntersectBam(string bamFile); + bool processHits(BED &a, vector<BED> &hits); - - bool FindOverlaps(const BED &a, vector<BED> &hits); - - bool FindOneOrMoreOverlap(const BED &a); - - void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, - const CHRPOS &s, const CHRPOS &e); - void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); - + + bool FindOverlaps(const BED &a, vector<BED> &hits); + + bool FindOneOrMoreOverlap(const BED &a); + + void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, + const CHRPOS &s, const CHRPOS &e); + void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); + void ReportHits(set<BED> &A, set<BED> &B); - + }; #endif /* INTERSECTBED_H */ diff --git a/src/fjoin/fjoinMain.cpp b/src/fjoin/fjoinMain.cpp index 3d91a7caaea8e13659ee025ddef83713734efd5b..48a2ad982a1c6c6074ce53879beb1287cdfb2ef9 100644 --- a/src/fjoin/fjoinMain.cpp +++ b/src/fjoin/fjoinMain.cpp @@ -26,246 +26,246 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - float overlapFraction = 1E-9; - - bool haveBedA = false; - bool haveBedB = false; - bool noHit = false; - bool anyHit = false; - bool writeA = false; - bool writeB = false; - bool writeCount = false; - bool writeOverlap = false; - bool writeAllOverlap = false; - bool haveFraction = false; - bool reciprocalFraction = false; - bool forceStrand = false; + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + + bool haveBedA = false; + bool haveBedB = false; + bool noHit = false; + bool anyHit = false; + bool writeA = false; + bool writeB = false; + bool writeCount = false; + bool writeOverlap = false; + bool writeAllOverlap = false; + bool haveFraction = false; + bool reciprocalFraction = false; + bool forceStrand = false; bool obeySplits = false; - bool inputIsBam = false; - bool outputIsBam = true; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - outputIsBam = false; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - inputIsBam = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { - outputIsBam = false; - } - else if(PARAMETER_CHECK("-u", 2, parameterLength)) { - anyHit = true; - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { - writeA = true; - } - else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { - writeB = true; - } - else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { - writeOverlap = true; - } - else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { - writeAllOverlap = true; - writeOverlap = true; - } - else if(PARAMETER_CHECK("-c", 2, parameterLength)) { - writeCount = true; - } - else if(PARAMETER_CHECK("-r", 2, parameterLength)) { - reciprocalFraction = true; - } - else if (PARAMETER_CHECK("-v", 2, parameterLength)) { - noHit = true; - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && noHit) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeB && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeCount && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeA && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeB && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (reciprocalFraction && !haveFraction) { - cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeB) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - - if (!showHelp) { - - BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, - writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, - reciprocalFraction, obeySplits, inputIsBam, outputIsBam); - delete bi; - return 0; - } - else { - ShowHelp(); - } + bool inputIsBam = false; + bool outputIsBam = true; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + outputIsBam = false; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + inputIsBam = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + outputIsBam = false; + } + else if(PARAMETER_CHECK("-u", 2, parameterLength)) { + anyHit = true; + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { + writeA = true; + } + else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { + writeB = true; + } + else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { + writeOverlap = true; + } + else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { + writeAllOverlap = true; + writeOverlap = true; + } + else if(PARAMETER_CHECK("-c", 2, parameterLength)) { + writeCount = true; + } + else if(PARAMETER_CHECK("-r", 2, parameterLength)) { + reciprocalFraction = true; + } + else if (PARAMETER_CHECK("-v", 2, parameterLength)) { + noHit = true; + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if (PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && noHit) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeCount && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeA && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (reciprocalFraction && !haveFraction) { + cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeB) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + + if (!showHelp) { + + BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, + writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, + reciprocalFraction, obeySplits, inputIsBam, outputIsBam); + delete bi; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Report overlaps between two feature files." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; - - cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; - cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; - - cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; - - cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; - cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; - - cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; - cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl; - cerr << "\t\t Only A features with overlap are reported." << endl << endl; - - cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; - cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; - cerr << "\t\t However, A features w/o overlap are also reported" << endl; - cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; - - cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; - cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - - cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; - cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - - cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; - cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; - - cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; - cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; - cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; - - cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; - cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; - cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; - - cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; - cerr << "\t\toverlap A on the same strand." << endl; - cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; - - cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; - - - // end the program here - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Report overlaps between two feature files." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; + + cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; + + cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; + cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; + + cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl; + cerr << "\t\t Only A features with overlap are reported." << endl << endl; + + cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; + cerr << "\t\t However, A features w/o overlap are also reported" << endl; + cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; + + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; + cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; + cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; + cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; + + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; + + cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; + cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; + cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; + cerr << "\t\toverlap A on the same strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; + + + // end the program here + exit(1); } diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index d841e4cb06c733a96d9e2835a690a740e35c7fbc..0bd4ff10c67cd26a47a8dbe6701cf51a4684ecd9 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -12,88 +12,88 @@ #include "lineFileUtilities.h" #include "genomeCoverageBed.h" - -BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, bool eachBase, - bool startSites, bool bedGraph, bool bedGraphAll, - int max, bool bamInput, bool obeySplits, - bool filterByStrand, string requestedStrand) { - - _bedFile = bedFile; - _genomeFile = genomeFile; - _eachBase = eachBase; - _startSites = startSites; - _bedGraph = bedGraph; - _bedGraphAll = bedGraphAll; - _max = max; - _bamInput = bamInput; - _obeySplits = obeySplits; - _filterByStrand = filterByStrand; - _requestedStrand = requestedStrand; - - _bed = new BedFile(bedFile); - _genome = new GenomeFile(genomeFile); - - if (_bamInput == false) - CoverageBed(); - else - CoverageBam(_bed->bedFile); + +BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, bool eachBase, + bool startSites, bool bedGraph, bool bedGraphAll, + int max, bool bamInput, bool obeySplits, + bool filterByStrand, string requestedStrand) { + + _bedFile = bedFile; + _genomeFile = genomeFile; + _eachBase = eachBase; + _startSites = startSites; + _bedGraph = bedGraph; + _bedGraphAll = bedGraphAll; + _max = max; + _bamInput = bamInput; + _obeySplits = obeySplits; + _filterByStrand = filterByStrand; + _requestedStrand = requestedStrand; + + _bed = new BedFile(bedFile); + _genome = new GenomeFile(genomeFile); + + if (_bamInput == false) + CoverageBed(); + else + CoverageBam(_bed->bedFile); } BedGenomeCoverage::~BedGenomeCoverage(void) { - delete _bed; - delete _genome; + delete _bed; + delete _genome; } void BedGenomeCoverage::ResetChromCoverage() { - _currChromName = ""; - _currChromSize = 0 ; - std::vector<DEPTH>().swap(_currChromCoverage); + _currChromName = ""; + _currChromSize = 0 ; + std::vector<DEPTH>().swap(_currChromCoverage); } void BedGenomeCoverage::StartNewChrom(const string& newChrom) { - // If we've moved beyond the first encountered chromosomes, - // process the results of the previous chromosome. - if (_currChromName.length() > 0) { - ReportChromCoverage(_currChromCoverage, _currChromSize, - _currChromName, _currChromDepthHist); - } - - // empty the previous chromosome and reserve new - std::vector<DEPTH>().swap(_currChromCoverage); - - if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) { - cerr << "Input error: Chromosome " << _currChromName - << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl; - - } - _visitedChromosomes.insert(newChrom); - - _currChromName = newChrom; - - // get the current chrom size and allocate space - _currChromSize = _genome->getChromSize(_currChromName); - - if (_currChromSize >= 0) - _currChromCoverage.resize(_currChromSize); - else { - cerr << "Input error: Chromosome " << _currChromName << " found in your BED file but not in your genome file." << endl; - exit(1); - } + // If we've moved beyond the first encountered chromosomes, + // process the results of the previous chromosome. + if (_currChromName.length() > 0) { + ReportChromCoverage(_currChromCoverage, _currChromSize, + _currChromName, _currChromDepthHist); + } + + // empty the previous chromosome and reserve new + std::vector<DEPTH>().swap(_currChromCoverage); + + if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) { + cerr << "Input error: Chromosome " << _currChromName + << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl; + + } + _visitedChromosomes.insert(newChrom); + + _currChromName = newChrom; + + // get the current chrom size and allocate space + _currChromSize = _genome->getChromSize(_currChromName); + + if (_currChromSize >= 0) + _currChromCoverage.resize(_currChromSize); + else { + cerr << "Input error: Chromosome " << _currChromName << " found in your BED file but not in your genome file." << endl; + exit(1); + } } void BedGenomeCoverage::AddCoverage(int start, int end) { - // process the first line for this chromosome. - // make sure the coordinates fit within the chrom - if (start < _currChromSize) - _currChromCoverage[start].starts++; - if (end < _currChromSize) - _currChromCoverage[end].ends++; - else - _currChromCoverage[_currChromSize-1].ends++; + // process the first line for this chromosome. + // make sure the coordinates fit within the chrom + if (start < _currChromSize) + _currChromCoverage[start].starts++; + if (end < _currChromSize) + _currChromCoverage[end].ends++; + else + _currChromCoverage[_currChromSize-1].ends++; } @@ -111,231 +111,231 @@ void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) { void BedGenomeCoverage::CoverageBed() { - BED a, nullBed; - int lineNum = 0; // current input line number - BedLineStatus bedStatus; - - ResetChromCoverage(); - - _bed->Open(); - while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) { - if (bedStatus == BED_VALID) { - if (_filterByStrand == true) { - if (a.strand.empty()) { - cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl; - exit(1); - } - if ( ! (a.strand == "-" || a.strand == "+") ) { - cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl; - exit(1); - } - // skip if the strand is not what the user requested. - if (a.strand != _requestedStrand) - continue; - } + BED a, nullBed; + int lineNum = 0; // current input line number + BedLineStatus bedStatus; + + ResetChromCoverage(); + + _bed->Open(); + while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) { + if (bedStatus == BED_VALID) { + if (_filterByStrand == true) { + if (a.strand.empty()) { + cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl; + exit(1); + } + if ( ! (a.strand == "-" || a.strand == "+") ) { + cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl; + exit(1); + } + // skip if the strand is not what the user requested. + if (a.strand != _requestedStrand) + continue; + } // are we on a new chromosome? - if (a.chrom != _currChromName) - StartNewChrom(a.chrom); + if (a.chrom != _currChromName) + StartNewChrom(a.chrom); - if (_obeySplits == true) { - bedVector bedBlocks; // vec to store the discrete BED "blocks" + if (_obeySplits == true) { + bedVector bedBlocks; // vec to store the discrete BED "blocks" splitBedIntoBlocks(a, lineNum, bedBlocks); - AddBlockedCoverage(bedBlocks); - } - else - AddCoverage(a.start, a.end-1); - } - } - _bed->Close(); - PrintFinalCoverage(); + AddBlockedCoverage(bedBlocks); + } + else + AddCoverage(a.start, a.end-1); + } + } + _bed->Close(); + PrintFinalCoverage(); } void BedGenomeCoverage::PrintFinalCoverage() { - // process the results of the last chromosome. - ReportChromCoverage(_currChromCoverage, _currChromSize, - _currChromName, _currChromDepthHist); - if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { - ReportGenomeCoverage(_currChromDepthHist); - } + // process the results of the last chromosome. + ReportChromCoverage(_currChromCoverage, _currChromSize, + _currChromName, _currChromDepthHist); + if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { + ReportGenomeCoverage(_currChromDepthHist); + } } void BedGenomeCoverage::CoverageBam(string bamFile) { ResetChromCoverage(); - - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // convert each aligned BAM entry to BED - // and compute coverage on B - BamAlignment bam; - while (reader.GetNextAlignment(bam)) { - - // skip if the read is unaligned - if (bam.IsMapped() == false) + + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // convert each aligned BAM entry to BED + // and compute coverage on B + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + + // skip if the read is unaligned + if (bam.IsMapped() == false) + continue; + + // skip if we care about strands and the strand isn't what + // the user wanted + if ( (_filterByStrand == true) && + ((_requestedStrand == "-") != bam.IsReverseStrand()) ) continue; - - // skip if we care about strands and the strand isn't what - // the user wanted - if ( (_filterByStrand == true) && - ((_requestedStrand == "-") != bam.IsReverseStrand()) ) - continue; - - // extract the chrom, start and end from the BAM alignment - string chrom(refs.at(bam.RefID).RefName); + + // extract the chrom, start and end from the BAM alignment + string chrom(refs.at(bam.RefID).RefName); CHRPOS start = bam.Position; CHRPOS end = bam.GetEndPosition(false) - 1; - + // are we on a new chromosome? - if ( chrom != _currChromName ) - StartNewChrom(chrom); + if ( chrom != _currChromName ) + StartNewChrom(chrom); // add coverage accordingly. - if (_obeySplits) { - bedVector bedBlocks; - // since we are counting coverage, we do want to split blocks when a - // deletion (D) CIGAR op is encountered (hence the true for the last parm) + if (_obeySplits) { + bedVector bedBlocks; + // since we are counting coverage, we do want to split blocks when a + // deletion (D) CIGAR op is encountered (hence the true for the last parm) getBamBlocks(bam, refs, bedBlocks, true); AddBlockedCoverage(bedBlocks); } else - AddCoverage(start, end); - } - // close the BAM - reader.Close(); - PrintFinalCoverage(); + AddCoverage(start, end); + } + // close the BAM + reader.Close(); + PrintFinalCoverage(); } void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom, chromHistMap &chromDepthHist) { - - if (_eachBase) { - int depth = 0; // initialize the depth - for (int pos = 0; pos < chromSize; pos++) { - - depth += chromCov[pos].starts; - // report the depth for this position. - cout << chrom << "\t" << pos+1 << "\t" << depth << endl; - depth = depth - chromCov[pos].ends; - } - } - else if (_bedGraph == true || _bedGraphAll == true) { - ReportChromCoverageBedGraph(chromCov, chromSize, chrom); - } - else { - - int depth = 0; // initialize the depth - - for (int pos = 0; pos < chromSize; pos++) { - - depth += chromCov[pos].starts; - - // add the depth at this position to the depth histogram - // for this chromosome. if the depth is greater than the - // maximum bin requested, then readjust the depth to be the max - if (depth >= _max) { - chromDepthHist[chrom][_max]++; - } - else { - chromDepthHist[chrom][depth]++; - } - depth = depth - chromCov[pos].ends; - } - // report the histogram for each chromosome - histMap::const_iterator depthIt = chromDepthHist[chrom].begin(); - histMap::const_iterator depthEnd = chromDepthHist[chrom].end(); - for (; depthIt != depthEnd; ++depthIt) { - int depth = depthIt->first; - unsigned int numBasesAtDepth = depthIt->second; - cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t" - << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl; - } - } + + if (_eachBase) { + int depth = 0; // initialize the depth + for (int pos = 0; pos < chromSize; pos++) { + + depth += chromCov[pos].starts; + // report the depth for this position. + cout << chrom << "\t" << pos+1 << "\t" << depth << endl; + depth = depth - chromCov[pos].ends; + } + } + else if (_bedGraph == true || _bedGraphAll == true) { + ReportChromCoverageBedGraph(chromCov, chromSize, chrom); + } + else { + + int depth = 0; // initialize the depth + + for (int pos = 0; pos < chromSize; pos++) { + + depth += chromCov[pos].starts; + + // add the depth at this position to the depth histogram + // for this chromosome. if the depth is greater than the + // maximum bin requested, then readjust the depth to be the max + if (depth >= _max) { + chromDepthHist[chrom][_max]++; + } + else { + chromDepthHist[chrom][depth]++; + } + depth = depth - chromCov[pos].ends; + } + // report the histogram for each chromosome + histMap::const_iterator depthIt = chromDepthHist[chrom].begin(); + histMap::const_iterator depthEnd = chromDepthHist[chrom].end(); + for (; depthIt != depthEnd; ++depthIt) { + int depth = depthIt->first; + unsigned int numBasesAtDepth = depthIt->second; + cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t" + << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl; + } + } } void BedGenomeCoverage::ReportGenomeCoverage(chromHistMap &chromDepthHist) { - - // get the list of chromosome names in the genome - vector<string> chromList = _genome->getChromList(); - - unsigned int genomeSize = 0; - vector<string>::const_iterator chromItr = chromList.begin(); - vector<string>::const_iterator chromEnd = chromList.end(); - for (; chromItr != chromEnd; ++chromItr) { - string chrom = *chromItr; - genomeSize += _genome->getChromSize(chrom); - // if there were no reads for a give chromosome, then - // add the length of the chrom to the 0 bin. - if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) { - chromDepthHist[chrom][0] += _genome->getChromSize(chrom); - } - } - - histMap genomeHist; // depth histogram for the entire genome - - // loop through each chromosome and add the depth and number of bases at each depth - // to the aggregate histogram for the entire genome - for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) { - string chrom = chromIt->first; - for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) { - int depth = depthIt->first; - unsigned int numBasesAtDepth = depthIt->second; - genomeHist[depth] += numBasesAtDepth; - } - } - - // loop through the depths for the entire genome - // and report the number and fraction of bases in - // the entire genome that are at said depth. - for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) { - int depth = genomeDepthIt->first; - unsigned int numBasesAtDepth = genomeDepthIt->second; - - cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t" - << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl; - } + + // get the list of chromosome names in the genome + vector<string> chromList = _genome->getChromList(); + + unsigned int genomeSize = 0; + vector<string>::const_iterator chromItr = chromList.begin(); + vector<string>::const_iterator chromEnd = chromList.end(); + for (; chromItr != chromEnd; ++chromItr) { + string chrom = *chromItr; + genomeSize += _genome->getChromSize(chrom); + // if there were no reads for a give chromosome, then + // add the length of the chrom to the 0 bin. + if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) { + chromDepthHist[chrom][0] += _genome->getChromSize(chrom); + } + } + + histMap genomeHist; // depth histogram for the entire genome + + // loop through each chromosome and add the depth and number of bases at each depth + // to the aggregate histogram for the entire genome + for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) { + string chrom = chromIt->first; + for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) { + int depth = depthIt->first; + unsigned int numBasesAtDepth = depthIt->second; + genomeHist[depth] += numBasesAtDepth; + } + } + + // loop through the depths for the entire genome + // and report the number and fraction of bases in + // the entire genome that are at said depth. + for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) { + int depth = genomeDepthIt->first; + unsigned int numBasesAtDepth = genomeDepthIt->second; + + cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t" + << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl; + } } void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom) { - int depth = 0; // initialize the depth - int lastStart = -1; - int lastDepth = -1; - - for (int pos = 0; pos < chromSize; pos++) { - depth += chromCov[pos].starts; - - if (depth != lastDepth) { - // Coverage depth has changed, print the last interval coverage (if any) - // Print if: - // (1) depth>0 (the default running mode), - // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll) - if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { - cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth << endl; - } - //Set current position as the new interval start + depth - lastDepth = depth; - lastStart = pos; - } - // Default: the depth has not changed, so we will not print anything. - // Proceed until the depth changes. - // Update depth - depth = depth - chromCov[pos].ends; - } - //Print information about the last position - if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { - cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth << endl; - } + int depth = 0; // initialize the depth + int lastStart = -1; + int lastDepth = -1; + + for (int pos = 0; pos < chromSize; pos++) { + depth += chromCov[pos].starts; + + if (depth != lastDepth) { + // Coverage depth has changed, print the last interval coverage (if any) + // Print if: + // (1) depth>0 (the default running mode), + // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll) + if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { + cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth << endl; + } + //Set current position as the new interval start + depth + lastDepth = depth; + lastStart = pos; + } + // Default: the depth has not changed, so we will not print anything. + // Proceed until the depth changes. + // Update depth + depth = depth - chromCov[pos].ends; + } + //Print information about the last position + if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { + cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth << endl; + } } diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h index bcf286cb4e6f2972c3383810801bffa648e228a3..7422e2b320cd22ba40250d4825dc5fc7fa689e40 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.h +++ b/src/genomeCoverageBed/genomeCoverageBed.h @@ -40,50 +40,50 @@ class BedGenomeCoverage { public: - // constructor - BedGenomeCoverage(string bedFile, string genomeFile, bool eachBase, bool startSites, - bool bedGraph, bool bedGraphAll, int max, bool bamInput, bool obeySplits, + // constructor + BedGenomeCoverage(string bedFile, string genomeFile, bool eachBase, bool startSites, + bool bedGraph, bool bedGraphAll, int max, bool bamInput, bool obeySplits, bool filterByStrand, string requestedStrand); - // destructor - ~BedGenomeCoverage(void); + // destructor + ~BedGenomeCoverage(void); private: - // data (parms) - string _bedFile; - string _genomeFile; - bool _bamInput; - bool _eachBase; - bool _startSites; - bool _bedGraph; - bool _bedGraphAll; - int _max; + // data (parms) + string _bedFile; + string _genomeFile; + bool _bamInput; + bool _eachBase; + bool _startSites; + bool _bedGraph; + bool _bedGraphAll; + int _max; bool _obeySplits; - bool _filterByStrand; - string _requestedStrand; - - BedFile *_bed; - GenomeFile *_genome; + bool _filterByStrand; + string _requestedStrand; + + BedFile *_bed; + GenomeFile *_genome; // data for internal processing - chromDepthMap _chromCov; - string _currChromName ; - vector<DEPTH> _currChromCoverage; - chromHistMap _currChromDepthHist; - int _currChromSize ; - set<string> _visitedChromosomes; - - - // methods - void CoverageBed(); - void CoverageBam(string bamFile); - void ReportChromCoverage(const vector<DEPTH> &, const int &chromSize, const string &chrom, chromHistMap&); - void ReportGenomeCoverage(chromHistMap &chromDepthHist); - void ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom); - void ResetChromCoverage(); - void StartNewChrom (const string& chrom); - void AddCoverage (int start, int end); + chromDepthMap _chromCov; + string _currChromName ; + vector<DEPTH> _currChromCoverage; + chromHistMap _currChromDepthHist; + int _currChromSize ; + set<string> _visitedChromosomes; + + + // methods + void CoverageBed(); + void CoverageBam(string bamFile); + void ReportChromCoverage(const vector<DEPTH> &, const int &chromSize, const string &chrom, chromHistMap&); + void ReportGenomeCoverage(chromHistMap &chromDepthHist); + void ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom); + void ResetChromCoverage(); + void StartNewChrom (const string& chrom); + void AddCoverage (int start, int end); void AddBlockedCoverage(const vector<BED> &bedBlocks); - void PrintFinalCoverage(); + void PrintFinalCoverage(); }; diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp index 27d426858cf89083c53730755bdc152527e3d692..a652d9360d7b23baa069d86f784934cf334cfb91 100644 --- a/src/genomeCoverageBed/genomeCoverageMain.cpp +++ b/src/genomeCoverageBed/genomeCoverageMain.cpp @@ -26,201 +26,201 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile; - string genomeFile; - int max = INT_MAX; - - bool haveBed = false; - bool bamInput = false; - bool haveGenome = false; - bool startSites = false; - bool bedGraph = false; - bool bedGraphAll = false; - bool eachBase = false; - bool obeySplits = false; - bool filterByStrand = false; - string requestedStrand = "X"; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - haveBed = true; - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-ibam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBed = true; - bamInput = true; - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) < argc) { - haveGenome = true; - genomeFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-d", 2, parameterLength)) { - eachBase = true; - } - else if(PARAMETER_CHECK("-bg", 3, parameterLength)) { - bedGraph = true; - } - else if(PARAMETER_CHECK("-bga", 4, parameterLength)) { - bedGraphAll = true; - } - else if(PARAMETER_CHECK("-max", 4, parameterLength)) { - if ((i+1) < argc) { - max = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else if(PARAMETER_CHECK("-strand", 7, parameterLength)) { - if ((i+1) < argc) { - filterByStrand = true; - requestedStrand = argv[i+1][0]; - if (!(requestedStrand == "-" || requestedStrand == "+")) { - cerr << "*****ERROR: invalid -strand value (" << requestedStrand << "). Allowed options are + or -" << endl; - showHelp = true; - } - i++; - } - else { - cerr << "*****ERROR: -strand options requires a value: + or -" << endl; - showHelp = true; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed || !haveGenome) { - cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; - showHelp = true; - } - if (bedGraph && eachBase) { - cerr << endl << "*****" << endl << "*****ERROR: Use -d or -bg, not both" << endl << "*****" << endl; - showHelp = true; - } - if (bedGraphAll && eachBase) { - cerr << endl << "*****" << endl << "*****ERROR: Use -d or -bga, not both" << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - BedGenomeCoverage *bc = new BedGenomeCoverage(bedFile, genomeFile, eachBase, - startSites, bedGraph, bedGraphAll, + // our configuration variables + bool showHelp = false; + + // input files + string bedFile; + string genomeFile; + int max = INT_MAX; + + bool haveBed = false; + bool bamInput = false; + bool haveGenome = false; + bool startSites = false; + bool bedGraph = false; + bool bedGraphAll = false; + bool eachBase = false; + bool obeySplits = false; + bool filterByStrand = false; + string requestedStrand = "X"; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-ibam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + bamInput = true; + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-d", 2, parameterLength)) { + eachBase = true; + } + else if(PARAMETER_CHECK("-bg", 3, parameterLength)) { + bedGraph = true; + } + else if(PARAMETER_CHECK("-bga", 4, parameterLength)) { + bedGraphAll = true; + } + else if(PARAMETER_CHECK("-max", 4, parameterLength)) { + if ((i+1) < argc) { + max = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else if(PARAMETER_CHECK("-strand", 7, parameterLength)) { + if ((i+1) < argc) { + filterByStrand = true; + requestedStrand = argv[i+1][0]; + if (!(requestedStrand == "-" || requestedStrand == "+")) { + cerr << "*****ERROR: invalid -strand value (" << requestedStrand << "). Allowed options are + or -" << endl; + showHelp = true; + } + i++; + } + else { + cerr << "*****ERROR: -strand options requires a value: + or -" << endl; + showHelp = true; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; + showHelp = true; + } + if (bedGraph && eachBase) { + cerr << endl << "*****" << endl << "*****ERROR: Use -d or -bg, not both" << endl << "*****" << endl; + showHelp = true; + } + if (bedGraphAll && eachBase) { + cerr << endl << "*****" << endl << "*****ERROR: Use -d or -bga, not both" << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedGenomeCoverage *bc = new BedGenomeCoverage(bedFile, genomeFile, eachBase, + startSites, bedGraph, bedGraphAll, max, bamInput, obeySplits, - filterByStrand, requestedStrand); - delete bc; - - return 0; - } - else { - ShowHelp(); - } + filterByStrand, requestedStrand); + delete bc; + + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << " Assaf Gordon, CSHL" << endl << endl; - - cerr << "Summary: Compute the coverage of a feature file among a genome." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-ibam\t" << "The input file is in BAM format." << endl; - cerr << "\t\tNote: BAM _must_ be sorted by position" << endl << endl; - - cerr << "\t-d\t" << "Report the depth at each genome position." << endl; - cerr << "\t\tDefault behavior is to report a histogram." << endl << endl; - - cerr << "\t-bg\t" << "Report depth in BedGraph format. For details, see:" << endl; - cerr << "\t\tgenome.ucsc.edu/goldenPath/help/bedgraph.html" << endl << endl; - - cerr << "\t-bga\t" << "Report depth in BedGraph format, as above (-bg)." << endl; - cerr << "\t\tHowever with this option, regions with zero " << endl; - cerr << "\t\tcoverage are also reported. This allows one to" << endl; - cerr << "\t\tquickly extract all regions of a genome with 0 " << endl; - cerr << "\t\tcoverage by applying: \"grep -w 0$\" to the output." << endl << endl; - - cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl; - cerr << "\t\twhen computing coverage." << endl; - cerr << "\t\tFor BAM files, this uses the CIGAR \"N\" and \"D\" operations " << endl; - cerr << "\t\tto infer the blocks for computing coverage." << endl; - cerr << "\t\tFor BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds" << endl; - cerr << "\t\tfields (i.e., columns 10,11,12)." << endl << endl; - - cerr << "\t-strand\t" << "Calculate coverage of intervals from a specific strand." << endl; - cerr << "\t\tWith BED files, requires at least 6 columns (strand is column 6). " << endl; - cerr << "\t\t- (STRING): can be + or -" << endl << endl; - - - cerr << "\t-max\t" << "Combine all positions with a depth >= max into" << endl; - cerr << "\t\ta single bin in the histogram. Irrelevant" << endl; - cerr << "\t\tfor -d and -bedGraph" << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "Notes: " << endl; - cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; - cerr << "\t <chromName><TAB><chromSize>" << endl << endl; - cerr << "\tFor example, Human (hg19):" << endl; - cerr << "\tchr1\t249250621" << endl; - cerr << "\tchr2\t243199373" << endl; - cerr << "\t..." << endl; - cerr << "\tchr18_gl000207_random\t4262" << endl << endl; - - cerr << "\t(2) The input BED (-i) file must be grouped by chromosome." << endl; - cerr << "\t A simple \"sort -k 1,1 <BED> > <BED>.sorted\" will suffice."<< endl << endl; - - cerr << "\t(3) The input BAM (-ibam) file must be sorted by position." << endl; - cerr << "\t A \"samtools sort <BAM>\" should suffice."<< endl << endl; - - cerr << "Tips: " << endl; - cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; - cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; - cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; - cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; - - - // end the program here - exit(1); + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << " Assaf Gordon, CSHL" << endl << endl; + + cerr << "Summary: Compute the coverage of a feature file among a genome." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-ibam\t" << "The input file is in BAM format." << endl; + cerr << "\t\tNote: BAM _must_ be sorted by position" << endl << endl; + + cerr << "\t-d\t" << "Report the depth at each genome position." << endl; + cerr << "\t\tDefault behavior is to report a histogram." << endl << endl; + + cerr << "\t-bg\t" << "Report depth in BedGraph format. For details, see:" << endl; + cerr << "\t\tgenome.ucsc.edu/goldenPath/help/bedgraph.html" << endl << endl; + + cerr << "\t-bga\t" << "Report depth in BedGraph format, as above (-bg)." << endl; + cerr << "\t\tHowever with this option, regions with zero " << endl; + cerr << "\t\tcoverage are also reported. This allows one to" << endl; + cerr << "\t\tquickly extract all regions of a genome with 0 " << endl; + cerr << "\t\tcoverage by applying: \"grep -w 0$\" to the output." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl; + cerr << "\t\twhen computing coverage." << endl; + cerr << "\t\tFor BAM files, this uses the CIGAR \"N\" and \"D\" operations " << endl; + cerr << "\t\tto infer the blocks for computing coverage." << endl; + cerr << "\t\tFor BED12 files, this uses the BlockCount, BlockStarts, and BlockEnds" << endl; + cerr << "\t\tfields (i.e., columns 10,11,12)." << endl << endl; + + cerr << "\t-strand\t" << "Calculate coverage of intervals from a specific strand." << endl; + cerr << "\t\tWith BED files, requires at least 6 columns (strand is column 6). " << endl; + cerr << "\t\t- (STRING): can be + or -" << endl << endl; + + + cerr << "\t-max\t" << "Combine all positions with a depth >= max into" << endl; + cerr << "\t\ta single bin in the histogram. Irrelevant" << endl; + cerr << "\t\tfor -d and -bedGraph" << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\t <chromName><TAB><chromSize>" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + cerr << "\t(2) The input BED (-i) file must be grouped by chromosome." << endl; + cerr << "\t A simple \"sort -k 1,1 <BED> > <BED>.sorted\" will suffice."<< endl << endl; + + cerr << "\t(3) The input BAM (-ibam) file must be sorted by position." << endl; + cerr << "\t A \"samtools sort <BAM>\" should suffice."<< endl << endl; + + cerr << "Tips: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + + // end the program here + exit(1); } diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp index a7d4969aa69eab9f7662e06a8ff767a9764e3ea4..c49038b119af238dca39db862a96f9ea166297ca 100644 --- a/src/groupBy/groupBy.cpp +++ b/src/groupBy/groupBy.cpp @@ -68,7 +68,7 @@ double ToDouble(const string &element); void TabPrintPost (string element); void TabPrintPre (string element); void CommaPrint (string element); - + int main(int argc, char* argv[]) { // input files @@ -76,19 +76,19 @@ int main(int argc, char* argv[]) { string groupColumnsString = "1,2,3"; string opsColumnString; string opsString; - + // our configuration variables bool showHelp = false; - bool haveOpColumns = false; - bool haveOps = true; - + bool haveOpColumns = false; + bool haveOps = true; + // check to see if we should print out some help if(argc <= 1) showHelp = true; for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); - if((PARAMETER_CHECK("-h", 2, parameterLength)) || + if((PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 5, parameterLength))) { showHelp = true; } @@ -106,51 +106,51 @@ int main(int argc, char* argv[]) { cerr << endl << "*****ERROR: -i parameter requires a value." << endl << endl; ShowHelp(); break; - } + } else { - inFile = argv[i + 1]; - i++; + inFile = argv[i + 1]; + i++; } } else if (PARAMETER_CHECK("-grp", 4, parameterLength) || PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { - cerr << endl << "*****ERROR: -grp parameter requires a value." << endl << endl; + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -grp parameter requires a value." << endl << endl; ShowHelp(); - break; - } - else { - groupColumnsString = argv[i + 1]; - i++; - } + break; + } + else { + groupColumnsString = argv[i + 1]; + i++; + } } else if(PARAMETER_CHECK("-opCols", 7, parameterLength) || PARAMETER_CHECK("-c", 2, parameterLength)) { - if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { - cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl; + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl; ShowHelp(); - break; - } - else { - haveOpColumns = true; - opsColumnString = argv[i + 1]; - i++; - } + break; + } + else { + haveOpColumns = true; + opsColumnString = argv[i + 1]; + i++; + } } else if(PARAMETER_CHECK("-ops", 4, parameterLength) || PARAMETER_CHECK("-o", 2, parameterLength)) { - if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { - cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl; + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl; ShowHelp(); break; - } - else { - haveOps = true; - opsString = argv[i + 1]; - i++; - } + } + else { + haveOps = true; + opsString = argv[i + 1]; + i++; + } } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; - } + } } if (!haveOpColumns) { @@ -165,45 +165,45 @@ int main(int argc, char* argv[]) { (ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") && (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "freqdesc") && (ops[i] != "freqasc")) { - cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "*****" << endl; - showHelp = true; + cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "*****" << endl; + showHelp = true; } } if (!showHelp) { - + // Split the column string sent by the user into discrete column numbers // A comma separated string is expected. vector<int> groupColumnsInt; Tokenize(groupColumnsString, groupColumnsInt, ","); - + vector<int> opColumnsInt; Tokenize(opsColumnString, opColumnsInt, ","); - - // sanity check the group columns + + // sanity check the group columns for(size_t i = 0; i < groupColumnsInt.size(); ++i) { int groupColumnInt = groupColumnsInt[i]; if (groupColumnInt < 1) { - cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; - ShowHelp(); + cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; + ShowHelp(); } } - - // sanity check the op columns + + // sanity check the op columns for(size_t i = 0; i < opColumnsInt.size(); ++i) { int opColumnInt = opColumnsInt[i]; if (opColumnInt < 1) { - cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; - ShowHelp(); + cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; + ShowHelp(); } } - + // sanity check that there are equal number of opColumns and ops if (ops.size() != opColumnsInt.size()) { - cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl; - ShowHelp(); + cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl; + ShowHelp(); } GroupBy(inFile, groupColumnsInt, opColumnsInt, ops); - } + } else { ShowHelp(); } @@ -212,20 +212,20 @@ int main(int argc, char* argv[]) { void ShowHelp(void) { cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; cerr << "Summary: Summarizes a dataset column based upon" << endl; cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl; - + cerr << "Usage: " << PROGRAM_NAME << " -i <input> -g <group_column(s)> -c <op_column(s)> -o <ops> " << endl << endl; cerr << "Options: " << endl; cerr << "\t-i\t\t" << "Input file. Use \"stdin\" for pipes." << endl << endl; - + cerr << "\t-g -grp\t\t" << "Specify the columns (1-based) for the grouping." << endl; cerr << "\t\t\tThe columns must be comma separated." << endl; - cerr << "\t\t\t- Default: 1,2,3" << endl << endl; + cerr << "\t\t\t- Default: 1,2,3" << endl << endl; cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl; cerr << "\t\t\t- Required." << endl << endl; @@ -234,7 +234,7 @@ void ShowHelp(void) { cerr << "\t\t\tValid operations:" << endl; cerr << "\t\t\t sum, count, min, max," << endl; cerr << "\t\t\t mean, median, mode, antimode," << endl; - cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl; + cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl; cerr << "\t\t\t collapse (i.e., print a comma separated list), " << endl; cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl; cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl; @@ -250,49 +250,49 @@ void ShowHelp(void) { cerr << "\tchr1 10 20 A 11000 10000" << endl << endl; cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl; cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl; - + cerr << "Notes: " << endl; cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl << endl; cerr << "\t(2) If -i is unspecified, input is assumed to come from stdin." << endl << endl; - + // end the program here exit(1); } -void GroupBy (const string &inFile, - const vector<int> &groupColumns, - const vector<int> &opColumns, +void GroupBy (const string &inFile, + const vector<int> &groupColumns, + const vector<int> &opColumns, const vector<string> &ops) { - + // current line number int lineNum = 0; // string representing current line string inLine; - + // vector of strings holding the tokenized current line vector<string> inFields; inFields.reserve(20); - + // keys for the current and previous group vector<string> prevGroup(0); vector<string> currGroup(0); - + // vector (one per column) of vector (one per value/column) of the opColumn values for the current group vector< vector<string> > values; for( size_t i = 0; i < opColumns.size(); i++ ) { values.push_back( vector<string>() ); } - + // check the status of the current line TabLineStatus tabLineStatus; - + // open a new tab file, loop through it line by line // and summarize the data for a given group when the group // fields change - TabFile *_tab = new TabFile(inFile); + TabFile *_tab = new TabFile(inFile); _tab->Open(); while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) { if (tabLineStatus == TAB_VALID) { @@ -331,16 +331,16 @@ void GroupBy (const string &inFile, void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) { - + vector<string> result; for( size_t i = 0; i < data.size(); i++ ) { - + string op = ops[i]; std::stringstream buffer; vector<double> dataF; // are we doing a numeric conversion? if so, convert the strings to doubles. if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") || - (op == "median") || (op == "stdev") || (op == "sstdev")) + (op == "median") || (op == "stdev") || (op == "sstdev")) { transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble); } @@ -395,7 +395,7 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d buffer << setprecision (PRECISION) << data[i].size(); result.push_back(buffer.str()); } - else if ((op == "mode") || (op == "antimode") || + else if ((op == "mode") || (op == "antimode") || (op == "freqdesc") || (op == "freqasc")) { // compute the frequency of each unique value map<string, int> freqs; @@ -404,7 +404,7 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d for (; dIt != dEnd; ++dIt) { freqs[*dIt]++; } - + // grab the mode and the anti mode string mode, antiMode; int count = 0; @@ -429,25 +429,25 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d result.push_back(buffer.str()); } else if (op == "freqdesc" || op == "freqasc") { - // pair for the num times a values was + // pair for the num times a values was // observed (1) and the value itself (2) pair<int, string> freqPair; vector< pair<int, string> > freqList; - + // create a list of pairs of all the observed values (second) // and their occurences (first) map<string,int>::const_iterator mapIter = freqs.begin(); - map<string,int>::const_iterator mapEnd = freqs.end(); + map<string,int>::const_iterator mapEnd = freqs.end(); for(; mapIter != mapEnd; ++mapIter) freqList.push_back( make_pair(mapIter->second, mapIter->first) ); - + // sort the list of pairs in the requested order by the frequency // this will make the value that was observed least/most bubble to the top if (op == "freqdesc") sort(freqList.begin(), freqList.end(), ValueGreaterThan()); else if (op == "freqasc") sort(freqList.begin(), freqList.end(), ValueLessThan()); - + // record all of the values and their frequencies. vector< pair<int, string> >::const_iterator iter = freqList.begin(); vector< pair<int, string> >::const_iterator iterEnd = freqList.end(); @@ -492,8 +492,8 @@ void addValue (const vector<string> &fromList, vector<string> &toList, int index toList.push_back(fromList.at(index)); } catch(std::out_of_range& e) { - cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " - << lineNum << ". Exiting." << endl << "*****" << endl; + cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " + << lineNum << ". Exiting." << endl << "*****" << endl; exit(1); } } diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index b6aae98e4b8885f2d14fe7fd6e32190130b875fa..8e94677ea531b62bb40d638f3d8d577a60fc0dc0 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -19,101 +19,101 @@ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool print // how many overlaps are there b/w the bed and the set of hits? int s, e, overlapBases; - int numOverlaps = 0; + int numOverlaps = 0; bool hitsFound = false; - int aLength = (a.end - a.start); // the length of a in b.p. - - // loop through the hits and report those that meet the user's criteria - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - s = max(a.start, h->start); - e = min(a.end, h->end); - overlapBases = (e - s); // the number of overlapping bases b/w a and b - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - // Report the hit if the user doesn't care about reciprocal overlap between A and B. - if (_reciprocal == false) { - hitsFound = true; - numOverlaps++; - if (printable == true) - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - // we require there to be sufficient __reciprocal__ overlap - else { - int bLength = (h->end - h->start); - float bOverlap = ( (float) overlapBases / (float) bLength ); - if (bOverlap >= _overlapFraction) { - hitsFound = true; - numOverlaps++; - if (printable == true) - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - } - } - } - // report the summary of the overlaps if requested. - ReportOverlapSummary(a, numOverlaps); - // were hits found for this BED feature? - return hitsFound; + int aLength = (a.end - a.start); // the length of a in b.p. + + // loop through the hits and report those that meet the user's criteria + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + s = max(a.start, h->start); + e = min(a.end, h->end); + overlapBases = (e - s); // the number of overlapping bases b/w a and b + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { + // Report the hit if the user doesn't care about reciprocal overlap between A and B. + if (_reciprocal == false) { + hitsFound = true; + numOverlaps++; + if (printable == true) + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + // we require there to be sufficient __reciprocal__ overlap + else { + int bLength = (h->end - h->start); + float bOverlap = ( (float) overlapBases / (float) bLength ); + if (bOverlap >= _overlapFraction) { + hitsFound = true; + numOverlaps++; + if (printable == true) + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + } + } + } + // report the summary of the overlaps if requested. + ReportOverlapSummary(a, numOverlaps); + // were hits found for this BED feature? + return hitsFound; } /* - Constructor + Constructor */ -BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam) { - - _bedAFile = bedAFile; - _bedBFile = bedBFile; - _anyHit = anyHit; - _noHit = noHit; - _writeA = writeA; - _writeB = writeB; - _writeOverlap = writeOverlap; - _writeAllOverlap = writeAllOverlap; - _writeCount = writeCount; - _overlapFraction = overlapFraction; - _forceStrand = forceStrand; - _reciprocal = reciprocal; +BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam) { + + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _anyHit = anyHit; + _noHit = noHit; + _writeA = writeA; + _writeB = writeB; + _writeOverlap = writeOverlap; + _writeAllOverlap = writeAllOverlap; + _writeCount = writeCount; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; + _reciprocal = reciprocal; _obeySplits = obeySplits; - _bamInput = bamInput; - _bamOutput = bamOutput; + _bamInput = bamInput; + _bamOutput = bamOutput; _isUncompressedBam = isUncompressedBam; - - // create new BED file objects for A and B - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - - if (_bamInput == false) - IntersectBed(); - else - IntersectBam(bedAFile); + + // create new BED file objects for A and B + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + if (_bamInput == false) + IntersectBed(); + else + IntersectBam(bedAFile); } /* - Destructor + Destructor */ BedIntersect::~BedIntersect(void) { } bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { - - bool hitsFound = false; - // should we print each overlap, or does the user want summary information? - bool printable = true; - if (_anyHit || _noHit || _writeCount) - printable = false; + bool hitsFound = false; + + // should we print each overlap, or does the user want summary information? + bool printable = true; + if (_anyHit || _noHit || _writeCount) + printable = false; - // collect and report the sufficient hits - _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); + // collect and report the sufficient hits + _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); hitsFound = processHits(a, hits, printable); return hitsFound; @@ -121,217 +121,217 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, - const CHRPOS &s, const CHRPOS &e) { - // default. simple intersection only - if (_writeA == false && _writeB == false && _writeOverlap == false) { - _bedA->reportBedRangeNewLine(a,s,e); - } - // -wa -wbwrite the original A and B - else if (_writeA == true && _writeB == true) { - _bedA->reportBedTab(a); - _bedB->reportBedNewLine(b); - } - // -wa write just the original A - else if (_writeA == true) { - _bedA->reportBedNewLine(a); - } - // -wb write the intersected portion of A and the original B - else if (_writeB == true) { - _bedA->reportBedRangeTab(a,s,e); - _bedB->reportBedNewLine(b); - } - // -wo write the original A and B plus the no. of overlapping bases. - else if (_writeOverlap == true) { - _bedA->reportBedTab(a); - _bedB->reportBedTab(b); - printf("%d\n", overlapBases); - } + const CHRPOS &s, const CHRPOS &e) { + // default. simple intersection only + if (_writeA == false && _writeB == false && _writeOverlap == false) { + _bedA->reportBedRangeNewLine(a,s,e); + } + // -wa -wbwrite the original A and B + else if (_writeA == true && _writeB == true) { + _bedA->reportBedTab(a); + _bedB->reportBedNewLine(b); + } + // -wa write just the original A + else if (_writeA == true) { + _bedA->reportBedNewLine(a); + } + // -wb write the intersected portion of A and the original B + else if (_writeB == true) { + _bedA->reportBedRangeTab(a,s,e); + _bedB->reportBedNewLine(b); + } + // -wo write the original A and B plus the no. of overlapping bases. + else if (_writeOverlap == true) { + _bedA->reportBedTab(a); + _bedB->reportBedTab(b); + printf("%d\n", overlapBases); + } } void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { - // -u just report the fact that there was >= 1 overlaps - if (_anyHit && (numOverlapsFound >= 1)) { - _bedA->reportBedNewLine(a); - } - // -c report the total number of features overlapped in B - else if (_writeCount) { - _bedA->reportBedTab(a); - printf("%d\n", numOverlapsFound); - } - // -v report iff there were no overlaps - else if (_noHit && (numOverlapsFound == 0)) { - _bedA->reportBedNewLine(a); - } - // -wao the user wants to force the reporting of 0 overlap - else if (_writeAllOverlap && (numOverlapsFound == 0)) { - _bedA->reportBedTab(a); - _bedB->reportNullBedTab(); - printf("0\n"); - } + // -u just report the fact that there was >= 1 overlaps + if (_anyHit && (numOverlapsFound >= 1)) { + _bedA->reportBedNewLine(a); + } + // -c report the total number of features overlapped in B + else if (_writeCount) { + _bedA->reportBedTab(a); + printf("%d\n", numOverlapsFound); + } + // -v report iff there were no overlaps + else if (_noHit && (numOverlapsFound == 0)) { + _bedA->reportBedNewLine(a); + } + // -wao the user wants to force the reporting of 0 overlap + else if (_writeAllOverlap && (numOverlapsFound == 0)) { + _bedA->reportBedTab(a); + _bedB->reportNullBedTab(); + printf("0\n"); + } } bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { - bool overlapsFound; - if (_reciprocal == false) { - overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, - _forceStrand, _overlapFraction); - } - else { - overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, - _forceStrand, _overlapFraction); - } - return overlapsFound; + bool overlapsFound; + if (_reciprocal == false) { + overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, + _forceStrand, _overlapFraction); + } + else { + overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, + _forceStrand, _overlapFraction); + } + return overlapsFound; } - + void BedIntersect::IntersectBed() { - // load the "B" file into a map in order to - // compare each entry in A to it in search of overlaps. - _bedB->loadBedFileIntoMap(); - - int lineNum = 0; - vector<BED> hits; - hits.reserve(100); - BED a, nullBed; - BedLineStatus bedStatus; - - // open the "A" file, process each BED entry and searh for overlaps. - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - // treat the BED as a single "block" - if (_obeySplits == false) { - FindOverlaps(a, hits); - hits.clear(); - a = nullBed; - } - // split the BED12 into blocks and look for overlaps in each discrete block + // load the "B" file into a map in order to + // compare each entry in A to it in search of overlaps. + _bedB->loadBedFileIntoMap(); + + int lineNum = 0; + vector<BED> hits; + hits.reserve(100); + BED a, nullBed; + BedLineStatus bedStatus; + + // open the "A" file, process each BED entry and searh for overlaps. + _bedA->Open(); + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + // treat the BED as a single "block" + if (_obeySplits == false) { + FindOverlaps(a, hits); + hits.clear(); + a = nullBed; + } + // split the BED12 into blocks and look for overlaps in each discrete block else { bedVector bedBlocks; // vec to store the discrete BED "blocks" splitBedIntoBlocks(a, lineNum, bedBlocks); - + vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - FindOverlaps(*bedItr, hits); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + FindOverlaps(*bedItr, hits); hits.clear(); - } - a = nullBed; + } + a = nullBed; } - } - } - _bedA->Close(); + } + } + _bedA->Close(); } void BedIntersect::IntersectBam(string bamFile) { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - // open the BAM file - BamReader reader; - BamWriter writer; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // open a BAM output to stdout if we are writing BAM - if (_bamOutput == true) { - // open our BAM writer + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + // open the BAM file + BamReader reader; + BamWriter writer; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // open a BAM output to stdout if we are writing BAM + if (_bamOutput == true) { + // open our BAM writer writer.Open("stdout", header, refs, _isUncompressedBam); - } - - vector<BED> hits; - // reserve some space - hits.reserve(100); - - _bedA->bedType = 6; - BamAlignment bam; - // get each set of alignments for each pair. - while (reader.GetNextAlignment(bam)) { - - if (bam.IsMapped()) { - BED a; - a.chrom = refs.at(bam.RefID).RefName; - a.start = bam.Position; - a.end = bam.GetEndPosition(false); - - // build the name field from the BAM alignment. - a.name = bam.Name; - if (bam.IsFirstMate()) a.name += "/1"; - if (bam.IsSecondMate()) a.name += "/2"; - - a.score = ToString(bam.MapQuality); - - a.strand = "+"; - if (bam.IsReverseStrand()) a.strand = "-"; - - if (_bamOutput == true) { - bool overlapsFound = false; - // treat the BAM alignment as a single "block" - if (_obeySplits == false) { - overlapsFound = FindOneOrMoreOverlap(a); - } - // split the BAM alignment into discrete blocks and - // look for overlaps only within each block. - else { + } + + vector<BED> hits; + // reserve some space + hits.reserve(100); + + _bedA->bedType = 6; + BamAlignment bam; + // get each set of alignments for each pair. + while (reader.GetNextAlignment(bam)) { + + if (bam.IsMapped()) { + BED a; + a.chrom = refs.at(bam.RefID).RefName; + a.start = bam.Position; + a.end = bam.GetEndPosition(false); + + // build the name field from the BAM alignment. + a.name = bam.Name; + if (bam.IsFirstMate()) a.name += "/1"; + if (bam.IsSecondMate()) a.name += "/2"; + + a.score = ToString(bam.MapQuality); + + a.strand = "+"; + if (bam.IsReverseStrand()) a.strand = "-"; + + if (_bamOutput == true) { + bool overlapsFound = false; + // treat the BAM alignment as a single "block" + if (_obeySplits == false) { + overlapsFound = FindOneOrMoreOverlap(a); + } + // split the BAM alignment into discrete blocks and + // look for overlaps only within each block. + else { bool overlapFoundForBlock; - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - // we don't want to split on "D" ops, hence the "false" + bedVector bedBlocks; // vec to store the discrete BED "blocks" from a + // we don't want to split on "D" ops, hence the "false" getBamBlocks(bam, refs, bedBlocks, false); - + vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - overlapFoundForBlock = FindOneOrMoreOverlap(a); - if (overlapFoundForBlock == true) + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + overlapFoundForBlock = FindOneOrMoreOverlap(a); + if (overlapFoundForBlock == true) overlapsFound = true; - } - } - if (overlapsFound == true) { - if (_noHit == false) - writer.SaveAlignment(bam); - } - else { - if (_noHit == true) { - writer.SaveAlignment(bam); - } - } - } - else { - // treat the BAM alignment as a single BED "block" - if (_obeySplits == false) { - FindOverlaps(a, hits); - hits.clear(); - } - // split the BAM alignment into discrete BED blocks and - // look for overlaps only within each block. - else { - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a + } + } + if (overlapsFound == true) { + if (_noHit == false) + writer.SaveAlignment(bam); + } + else { + if (_noHit == true) { + writer.SaveAlignment(bam); + } + } + } + else { + // treat the BAM alignment as a single BED "block" + if (_obeySplits == false) { + FindOverlaps(a, hits); + hits.clear(); + } + // split the BAM alignment into discrete BED blocks and + // look for overlaps only within each block. + else { + bedVector bedBlocks; // vec to store the discrete BED "blocks" from a getBamBlocks(bam, refs, bedBlocks, false); vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - FindOverlaps(*bedItr, hits); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + FindOverlaps(*bedItr, hits); hits.clear(); - } - } - } - } - } - - // close the relevant BAM files. - reader.Close(); - if (_bamOutput == true) { - writer.Close(); - } + } + } + } + } + } + + // close the relevant BAM files. + reader.Close(); + if (_bamOutput == true) { + writer.Close(); + } } diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index b7524f2797149a91bbe0ca2f611d5fcf67fe7044..a24cf56836a21f47411c9aa62f7c8c56ba26ca22 100644 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -32,62 +32,62 @@ class BedIntersect { public: - // constructor - BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam); - - // destructor - ~BedIntersect(void); - + // constructor + BedIntersect(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam); + + // destructor + ~BedIntersect(void); + private: - - //------------------------------------------------ - // private attributes - //------------------------------------------------ - string _bedAFile; - string _bedBFile; - - bool _writeA; // should the original A feature be reported? - bool _writeB; // should the original B feature be reported? - bool _writeOverlap; - bool _writeAllOverlap; - - bool _forceStrand; - bool _reciprocal; - float _overlapFraction; - - bool _anyHit; - bool _noHit; - bool _writeCount; // do we want a count of the number of overlaps in B? + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + string _bedAFile; + string _bedBFile; + + bool _writeA; // should the original A feature be reported? + bool _writeB; // should the original B feature be reported? + bool _writeOverlap; + bool _writeAllOverlap; + + bool _forceStrand; + bool _reciprocal; + float _overlapFraction; + + bool _anyHit; + bool _noHit; + bool _writeCount; // do we want a count of the number of overlaps in B? bool _obeySplits; - bool _bamInput; - bool _bamOutput; + bool _bamInput; + bool _bamOutput; bool _isUncompressedBam; - - // instance of a bed file class. - BedFile *_bedA, *_bedB; - //------------------------------------------------ - // private methods - //------------------------------------------------ - void IntersectBed(istream &bedInput); + // instance of a bed file class. + BedFile *_bedA, *_bedB; - void IntersectBed(); + //------------------------------------------------ + // private methods + //------------------------------------------------ + void IntersectBed(istream &bedInput); + + void IntersectBed(); + + void IntersectBam(string bamFile); - void IntersectBam(string bamFile); - bool processHits(const BED &a, const vector<BED> &hits, bool printable); - - bool FindOverlaps(const BED &a, vector<BED> &hits); - - bool FindOneOrMoreOverlap(const BED &a); - - void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, - const CHRPOS &s, const CHRPOS &e); - void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); - + + bool FindOverlaps(const BED &a, vector<BED> &hits); + + bool FindOneOrMoreOverlap(const BED &a); + + void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, + const CHRPOS &s, const CHRPOS &e); + void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); + }; #endif /* INTERSECTBED_H */ diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index 42cb38a1812f278e53f9636b173b0142795a2d36..e6d35cd5c03b339cc5f811323ab82bd7918f2368 100644 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -26,251 +26,251 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - float overlapFraction = 1E-9; - - bool haveBedA = false; - bool haveBedB = false; - bool noHit = false; - bool anyHit = false; - bool writeA = false; - bool writeB = false; - bool writeCount = false; - bool writeOverlap = false; - bool writeAllOverlap = false; - bool haveFraction = false; - bool reciprocalFraction = false; - bool forceStrand = false; + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + + bool haveBedA = false; + bool haveBedB = false; + bool noHit = false; + bool anyHit = false; + bool writeA = false; + bool writeB = false; + bool writeCount = false; + bool writeOverlap = false; + bool writeAllOverlap = false; + bool haveFraction = false; + bool reciprocalFraction = false; + bool forceStrand = false; bool obeySplits = false; - bool inputIsBam = false; - bool outputIsBam = true; - bool uncompressedBam = false; - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - outputIsBam = false; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - inputIsBam = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { - outputIsBam = false; - } - else if(PARAMETER_CHECK("-u", 2, parameterLength)) { - anyHit = true; - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { - writeA = true; - } - else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { - writeB = true; - } - else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { - writeOverlap = true; - } - else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { - writeAllOverlap = true; - writeOverlap = true; - } - else if(PARAMETER_CHECK("-c", 2, parameterLength)) { - writeCount = true; - } - else if(PARAMETER_CHECK("-r", 2, parameterLength)) { - reciprocalFraction = true; - } - else if (PARAMETER_CHECK("-v", 2, parameterLength)) { - noHit = true; - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { + bool inputIsBam = false; + bool outputIsBam = true; + bool uncompressedBam = false; + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + outputIsBam = false; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + inputIsBam = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + outputIsBam = false; + } + else if(PARAMETER_CHECK("-u", 2, parameterLength)) { + anyHit = true; + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { + writeA = true; + } + else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { + writeB = true; + } + else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { + writeOverlap = true; + } + else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { + writeAllOverlap = true; + writeOverlap = true; + } + else if(PARAMETER_CHECK("-c", 2, parameterLength)) { + writeCount = true; + } + else if(PARAMETER_CHECK("-r", 2, parameterLength)) { + reciprocalFraction = true; + } + else if (PARAMETER_CHECK("-v", 2, parameterLength)) { + noHit = true; + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if (PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { uncompressedBam = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && noHit) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeB && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeCount && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeA && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (writeB && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (reciprocalFraction && !haveFraction) { - cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeB) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeOverlap) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; - showHelp = true; - } - - - if (!showHelp) { - - BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, - writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, - reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam); - delete bi; - return 0; - } - else { - ShowHelp(); - } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && noHit) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeCount && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeA && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (reciprocalFraction && !haveFraction) { + cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeB) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + + if (!showHelp) { + + BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, + writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, + reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam); + delete bi; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Report overlaps between two feature files." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; - - cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; - - cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; - cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; - - cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; - - cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; - cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; - - cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; - cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl; - cerr << "\t\t Only A features with overlap are reported." << endl << endl; - - cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; - cerr << "\t\tpairs of overlap between the two features." << endl; - cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; - cerr << "\t\t However, A features w/o overlap are also reported" << endl; - cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; - - cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; - cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - - cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; - cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; - cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; - - cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; - cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; - - cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; - cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; - cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; - - cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; - cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; - cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; - - cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; - cerr << "\t\toverlap A on the same strand." << endl; - cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; - - cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; - - - // end the program here - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Report overlaps between two feature files." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; + + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; + + cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; + + cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; + cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; + + cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl; + cerr << "\t\t Only A features with overlap are reported." << endl << endl; + + cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; + cerr << "\t\t However, A features w/o overlap are also reported" << endl; + cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; + + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; + cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; + cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; + cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; + + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; + + cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; + cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; + cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; + cerr << "\t\toverlap A on the same strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; + + + // end the program here + exit(1); } diff --git a/src/linksBed/linksBed.cpp b/src/linksBed/linksBed.cpp index 3c0173e0ad6cf613d57d029ee0401ada4ad5e4ba..bf1a74c8891ad606f9e3e283ec229a38abd5824f 100644 --- a/src/linksBed/linksBed.cpp +++ b/src/linksBed/linksBed.cpp @@ -16,14 +16,14 @@ // Constructor // BedLinks::BedLinks(string &bedFile, string &base, string &org, string &db) { - _bedFile = bedFile; - _bed = new BedFile(bedFile); - - _base = base; - _org = org; - _db = db; - - CreateLinks(); + _bedFile = bedFile; + _bed = new BedFile(bedFile); + + _base = base; + _org = org; + _db = db; + + CreateLinks(); } // @@ -35,88 +35,88 @@ BedLinks::~BedLinks(void) { void BedLinks::WriteURL(BED &bed, string &base) { - string position = bed.chrom; - std::stringstream posStream; - posStream << ":" << bed.start << "-" << bed.end; - position.append(posStream.str()); - - cout << "<tr>" << endl; - cout << "\t<td>" << endl; - cout << "\t\t<a href=" << base << position << ">"; - cout << bed.chrom << ":" << bed.start << "-" << bed.end; - cout << "</a>" << endl; - cout << "\t</td>" << endl; - - if (_bed->bedType == 4) { - cout << "\t<td>" << endl; - cout << bed.name << endl; - cout << "\t</td>" << endl; - } - else if (_bed->bedType == 5) { - cout << "\t<td>" << endl; - cout << bed.name << endl; - cout << "\t</td>" << endl; - - cout << "\t<td>" << endl; - cout << bed.score << endl; - cout << "\t</td>" << endl; - } - else if ((_bed->bedType == 6) || (_bed->bedType == 9) || (_bed->bedType == 12)) { - cout << "\t<td>" << endl; - cout << bed.name << endl; - cout << "\t</td>" << endl; - - cout << "\t<td>" << endl; - cout << bed.score << endl; - cout << "\t</td>" << endl; - - cout << "\t<td>" << endl; - cout << bed.strand << endl; - cout << "\t</td>" << endl; - } - cout << "</tr>" << endl; + string position = bed.chrom; + std::stringstream posStream; + posStream << ":" << bed.start << "-" << bed.end; + position.append(posStream.str()); + + cout << "<tr>" << endl; + cout << "\t<td>" << endl; + cout << "\t\t<a href=" << base << position << ">"; + cout << bed.chrom << ":" << bed.start << "-" << bed.end; + cout << "</a>" << endl; + cout << "\t</td>" << endl; + + if (_bed->bedType == 4) { + cout << "\t<td>" << endl; + cout << bed.name << endl; + cout << "\t</td>" << endl; + } + else if (_bed->bedType == 5) { + cout << "\t<td>" << endl; + cout << bed.name << endl; + cout << "\t</td>" << endl; + + cout << "\t<td>" << endl; + cout << bed.score << endl; + cout << "\t</td>" << endl; + } + else if ((_bed->bedType == 6) || (_bed->bedType == 9) || (_bed->bedType == 12)) { + cout << "\t<td>" << endl; + cout << bed.name << endl; + cout << "\t</td>" << endl; + + cout << "\t<td>" << endl; + cout << bed.score << endl; + cout << "\t</td>" << endl; + + cout << "\t<td>" << endl; + cout << bed.strand << endl; + cout << "\t</td>" << endl; + } + cout << "</tr>" << endl; } void BedLinks::CreateLinks() { - // construct the html base. - string org = _org; - string db = _db; - string base = _base; - base.append("/cgi-bin/hgTracks?org="); - base.append(org); - base.append("&db="); - base.append(db); - base.append("&position="); - - // create the HTML header - cout << "<html>" << endl <<"\t<body>" << endl; - cout << "<title>" << _bedFile << "</title>" << endl; - - // start the table of entries - cout << "<br>Firefox users: Press and hold the \"apple\" or \"alt\" key and click link to open in new tab." << endl; - cout << "<p style=\"font-family:courier\">" << endl; - cout << "<table border=\"0\" align=\"justify\"" << endl; - cout << "<h3>BED Entries from: stdin </h3>" << endl; - - int lineNum = 0; - BED bedEntry, nullBed; - BedLineStatus bedStatus; - - _bed->Open(); - while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - WriteURL(bedEntry, base); - bedEntry = nullBed; - } - } - _bed->Close(); - - cout << "</table>" << endl; - cout << "</p>" << endl; - cout << "\t</body>" << endl <<"</html>" << endl; + // construct the html base. + string org = _org; + string db = _db; + string base = _base; + base.append("/cgi-bin/hgTracks?org="); + base.append(org); + base.append("&db="); + base.append(db); + base.append("&position="); + + // create the HTML header + cout << "<html>" << endl <<"\t<body>" << endl; + cout << "<title>" << _bedFile << "</title>" << endl; + + // start the table of entries + cout << "<br>Firefox users: Press and hold the \"apple\" or \"alt\" key and click link to open in new tab." << endl; + cout << "<p style=\"font-family:courier\">" << endl; + cout << "<table border=\"0\" align=\"justify\"" << endl; + cout << "<h3>BED Entries from: stdin </h3>" << endl; + + int lineNum = 0; + BED bedEntry, nullBed; + BedLineStatus bedStatus; + + _bed->Open(); + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + WriteURL(bedEntry, base); + bedEntry = nullBed; + } + } + _bed->Close(); + + cout << "</table>" << endl; + cout << "</p>" << endl; + cout << "\t</body>" << endl <<"</html>" << endl; } diff --git a/src/linksBed/linksBed.h b/src/linksBed/linksBed.h index 4d812239419d91eb1fd37b651464778914bf3f0b..941c2e9290329976c17d0b810018eb4543c42a22 100644 --- a/src/linksBed/linksBed.h +++ b/src/linksBed/linksBed.h @@ -24,21 +24,21 @@ class BedLinks { public: - // constructor - BedLinks(string &bedFile, string &base, string &org, string &db); - - // destructor - ~BedLinks(void); - -private: - string _bedFile; - string _base; - string _org; - string _db; - - // instance of a bed file class. - BedFile *_bed; - - void WriteURL(BED &bed, string &base); - void CreateLinks(); // the default. sorts by chrom (asc.) then by start (asc.) + // constructor + BedLinks(string &bedFile, string &base, string &org, string &db); + + // destructor + ~BedLinks(void); + +private: + string _bedFile; + string _base; + string _org; + string _db; + + // instance of a bed file class. + BedFile *_bed; + + void WriteURL(BED &bed, string &base); + void CreateLinks(); // the default. sorts by chrom (asc.) then by start (asc.) }; diff --git a/src/linksBed/linksMain.cpp b/src/linksBed/linksMain.cpp index 2bc1822d62fb0b60593dd891148fe80505c98168..f4e5500d68686694b9262285f43d9608bf05828c 100644 --- a/src/linksBed/linksMain.cpp +++ b/src/linksBed/linksMain.cpp @@ -26,104 +26,104 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - bool haveBed = true; - - /* Defaults for everyone else */ - string org = "human"; - string db = "hg18"; - string base = "http://genome.ucsc.edu"; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-base", 5, parameterLength)) { - if ((i+1) < argc) { - base = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-org", 4, parameterLength)) { - if ((i+1) < argc) { - org = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-db", 3, parameterLength)) { - if ((i+1) < argc) { - db = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedLinks *bl = new BedLinks(bedFile, base, org, db); - delete bl; - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + bool haveBed = true; + + /* Defaults for everyone else */ + string org = "human"; + string db = "hg18"; + string base = "http://genome.ucsc.edu"; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-base", 5, parameterLength)) { + if ((i+1) < argc) { + base = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-org", 4, parameterLength)) { + if ((i+1) < argc) { + org = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-db", 3, parameterLength)) { + if ((i+1) < argc) { + db = argv[i + 1]; + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedLinks *bl = new BedLinks(bedFile, base, org, db); + delete bl; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Creates HTML links to an UCSC Genome Browser from a feature file." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> > out.html" << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t-base\t" << "The browser basename. Default: http://genome.ucsc.edu " << endl; - cerr << "\t-org\t" << "The organism. Default: human" << endl; - cerr << "\t-db\t" << "The build. Default: hg18" << endl << endl; - - cerr << "Example: " << endl; - cerr << "\t" << "By default, the links created will point to human (hg18) UCSC browser." << endl; - cerr << "\tIf you have a local mirror, you can override this behavior by supplying" << endl; - cerr << "\tthe -base, -org, and -db options." << endl << endl; - cerr << "\t" << "For example, if the URL of your local mirror for mouse MM9 is called: " << endl; - cerr << "\thttp://mymirror.myuniversity.edu, then you would use the following:" << endl; - cerr << "\t" << "-base http://mymirror.myuniversity.edu" << endl; - cerr << "\t" << "-org mouse" << endl; - cerr << "\t" << "-db mm9" << endl; - - - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Creates HTML links to an UCSC Genome Browser from a feature file." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> > out.html" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-base\t" << "The browser basename. Default: http://genome.ucsc.edu " << endl; + cerr << "\t-org\t" << "The organism. Default: human" << endl; + cerr << "\t-db\t" << "The build. Default: hg18" << endl << endl; + + cerr << "Example: " << endl; + cerr << "\t" << "By default, the links created will point to human (hg18) UCSC browser." << endl; + cerr << "\tIf you have a local mirror, you can override this behavior by supplying" << endl; + cerr << "\tthe -base, -org, and -db options." << endl << endl; + cerr << "\t" << "For example, if the URL of your local mirror for mouse MM9 is called: " << endl; + cerr << "\thttp://mymirror.myuniversity.edu, then you would use the following:" << endl; + cerr << "\t" << "-base http://mymirror.myuniversity.edu" << endl; + cerr << "\t" << "-org mouse" << endl; + cerr << "\t" << "-db mm9" << endl; + + + exit(1); } diff --git a/src/maskFastaFromBed/maskFastaFromBed.cpp b/src/maskFastaFromBed/maskFastaFromBed.cpp index 66e3df11a7d55b7e21be1ee33f6761a13337a059..654ff6dbcd28abb1c14babc67ca241864f4a46cf 100644 --- a/src/maskFastaFromBed/maskFastaFromBed.cpp +++ b/src/maskFastaFromBed/maskFastaFromBed.cpp @@ -15,20 +15,20 @@ MaskFastaFromBed::MaskFastaFromBed(string &fastaInFile, string &bedFile, string &fastaOutFile, bool &softMask) { - _softMask = false; - if (softMask) { - _softMask = true; - } - - _fastaInFile = fastaInFile; - _bedFile = bedFile; - _fastaOutFile = fastaOutFile; - - _bed = new BedFile(_bedFile); - - _bed->loadBedFileIntoMapNoBin(); - - MaskFasta(); + _softMask = false; + if (softMask) { + _softMask = true; + } + + _fastaInFile = fastaInFile; + _bedFile = bedFile; + _fastaOutFile = fastaOutFile; + + _bed = new BedFile(_bedFile); + + _bed->loadBedFileIntoMapNoBin(); + + MaskFasta(); } @@ -41,119 +41,119 @@ MaskFastaFromBed::~MaskFastaFromBed(void) { //****************************************************************************** void MaskFastaFromBed::MaskFasta() { - /* Make sure that we can open all of the files successfully*/ - - // open the fasta database for reading - ifstream fa(_fastaInFile.c_str(), ios::in); - if ( !fa ) { - cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - - // open the fasta database for reading - ofstream faOut(_fastaOutFile.c_str(), ios::out); - if ( !faOut ) { - cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - - - /* Read the fastaDb chromosome by chromosome*/ - string fastaInLine; - string currChrom; - string currDNA = ""; - currDNA.reserve(500000000); - int fastaWidth = -1; - bool widthSet = false; - int start, end, length; - string replacement; - - while (getline(fa,fastaInLine)) { - - if (fastaInLine.find(">",0) != 0 ) { - if (widthSet == false) { - fastaWidth = fastaInLine.size(); - widthSet = true; - } - currDNA += fastaInLine; - } - else { - if (currDNA.size() > 0) { - - vector<BED> bedList = _bed->bedMapNoBin[currChrom]; - - /* - loop through each BED entry for this chrom and - mask the requested sequence in the FASTA file. - */ - for (unsigned int i = 0; i < bedList.size(); i++) { - start = bedList[i].start; - end = bedList[i].end; - length = end - start; - - /* - (1) if soft masking, extract the sequence, lowercase it, - then put it back - (2) otherwise replace with Ns - */ - if (_softMask) { - replacement = currDNA.substr(start, length); - toLowerCase(replacement); - currDNA.replace(start, length, replacement); - } - else { - string hardmask(length, 'N'); - currDNA.replace(start, length, hardmask); - } - } - // write the masked chrom to the output file - PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); - } - - // reset for the next chromosome. - currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1); - currDNA = ""; - } - } - - // process the last chromosome. - // exact same logic as in the main loop. - if (currDNA.size() > 0) { - - vector<BED> bedList = _bed->bedMapNoBin[currChrom]; - - for (unsigned int i = 0; i < bedList.size(); i++) { - start = bedList[i].start; - end = bedList[i].end; - length = end - start; - - if (_softMask) { - replacement = currDNA.substr(start, length); - toLowerCase(replacement); - currDNA.replace(start, length, replacement); - } - else { - string hardmask(length, 'N'); - currDNA.replace(start, length, hardmask); - } - } - PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); - } - - // closed for business. - fa.close(); - faOut.close(); + /* Make sure that we can open all of the files successfully*/ + + // open the fasta database for reading + ifstream fa(_fastaInFile.c_str(), ios::in); + if ( !fa ) { + cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + // open the fasta database for reading + ofstream faOut(_fastaOutFile.c_str(), ios::out); + if ( !faOut ) { + cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + + /* Read the fastaDb chromosome by chromosome*/ + string fastaInLine; + string currChrom; + string currDNA = ""; + currDNA.reserve(500000000); + int fastaWidth = -1; + bool widthSet = false; + int start, end, length; + string replacement; + + while (getline(fa,fastaInLine)) { + + if (fastaInLine.find(">",0) != 0 ) { + if (widthSet == false) { + fastaWidth = fastaInLine.size(); + widthSet = true; + } + currDNA += fastaInLine; + } + else { + if (currDNA.size() > 0) { + + vector<BED> bedList = _bed->bedMapNoBin[currChrom]; + + /* + loop through each BED entry for this chrom and + mask the requested sequence in the FASTA file. + */ + for (unsigned int i = 0; i < bedList.size(); i++) { + start = bedList[i].start; + end = bedList[i].end; + length = end - start; + + /* + (1) if soft masking, extract the sequence, lowercase it, + then put it back + (2) otherwise replace with Ns + */ + if (_softMask) { + replacement = currDNA.substr(start, length); + toLowerCase(replacement); + currDNA.replace(start, length, replacement); + } + else { + string hardmask(length, 'N'); + currDNA.replace(start, length, hardmask); + } + } + // write the masked chrom to the output file + PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); + } + + // reset for the next chromosome. + currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1); + currDNA = ""; + } + } + + // process the last chromosome. + // exact same logic as in the main loop. + if (currDNA.size() > 0) { + + vector<BED> bedList = _bed->bedMapNoBin[currChrom]; + + for (unsigned int i = 0; i < bedList.size(); i++) { + start = bedList[i].start; + end = bedList[i].end; + length = end - start; + + if (_softMask) { + replacement = currDNA.substr(start, length); + toLowerCase(replacement); + currDNA.replace(start, length, replacement); + } + else { + string hardmask(length, 'N'); + currDNA.replace(start, length, hardmask); + } + } + PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); + } + + // closed for business. + fa.close(); + faOut.close(); } void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) { - - int seqLength = sequence.size(); - - out << ">" << chrom << endl; - for(int i = 0; i < seqLength; i += width) { - if (i + width < seqLength) out << sequence.substr(i, width) << endl; - else out << sequence.substr(i, seqLength-i) << endl; - } + + int seqLength = sequence.size(); + + out << ">" << chrom << endl; + for(int i = 0; i < seqLength; i += width) { + if (i + width < seqLength) out << sequence.substr(i, width) << endl; + else out << sequence.substr(i, seqLength-i) << endl; + } } diff --git a/src/maskFastaFromBed/maskFastaFromBed.h b/src/maskFastaFromBed/maskFastaFromBed.h index bfeb383abc9deaec0490a5ffe72a79dd9c97baa0..1b7a3590b2722285ad11ae14eaaccb8dee2ed2ea 100644 --- a/src/maskFastaFromBed/maskFastaFromBed.h +++ b/src/maskFastaFromBed/maskFastaFromBed.h @@ -16,7 +16,7 @@ #include <vector> #include <iostream> #include <fstream> -#include <cctype> /* for tolower */ +#include <cctype> /* for tolower */ using namespace std; @@ -26,28 +26,28 @@ using namespace std; class MaskFastaFromBed { public: - - // constructor - MaskFastaFromBed(string &fastaInFile, string &bedFile, string &fastaOutFile, bool &softMask); - // destructor - ~MaskFastaFromBed(void); + // constructor + MaskFastaFromBed(string &fastaInFile, string &bedFile, string &fastaOutFile, bool &softMask); + + // destructor + ~MaskFastaFromBed(void); + - private: - - bool _softMask; - - string _fastaInFile; - string _bedFile; - string _fastaOutFile; - - // instance of a bed file class. - BedFile *_bed; - - void MaskFasta(); - - void PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width); + + bool _softMask; + + string _fastaInFile; + string _bedFile; + string _fastaOutFile; + + // instance of a bed file class. + BedFile *_bed; + + void MaskFasta(); + + void PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width); }; diff --git a/src/maskFastaFromBed/maskFastaFromBedMain.cpp b/src/maskFastaFromBed/maskFastaFromBedMain.cpp index b48dcb217a088e5616c0f3d3eef3743119de6e49..0978436d9d5e05f01dc3fe48aa2fcc5809bfa05b 100644 --- a/src/maskFastaFromBed/maskFastaFromBedMain.cpp +++ b/src/maskFastaFromBed/maskFastaFromBedMain.cpp @@ -26,106 +26,106 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string fastaInFile; - string bedFile; - - // output files - string fastaOutFile; - - // checks for existence of parameters - bool haveFastaIn = false; - bool haveBed = false; - bool haveFastaOut = false; - bool softMask = false; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-fi", 3, parameterLength)) { - if ((i+1) < argc) { - haveFastaIn = true; - fastaInFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-fo", 3, parameterLength)) { - if ((i+1) < argc) { - haveFastaOut = true; - fastaOutFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { - if ((i+1) < argc) { - haveBed = true; - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-soft", 5, parameterLength)) { - softMask = true; - } - else { - cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - if (!haveFastaIn || !haveFastaOut || !haveBed) { - showHelp = true; - } - - if (!showHelp) { - - MaskFastaFromBed *maskFasta = new MaskFastaFromBed(fastaInFile, bedFile, fastaOutFile, softMask); - delete maskFasta; - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string fastaInFile; + string bedFile; + + // output files + string fastaOutFile; + + // checks for existence of parameters + bool haveFastaIn = false; + bool haveBed = false; + bool haveFastaOut = false; + bool softMask = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-fi", 3, parameterLength)) { + if ((i+1) < argc) { + haveFastaIn = true; + fastaInFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-fo", 3, parameterLength)) { + if ((i+1) < argc) { + haveFastaOut = true; + fastaOutFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-soft", 5, parameterLength)) { + softMask = true; + } + else { + cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + if (!haveFastaIn || !haveFastaOut || !haveBed) { + showHelp = true; + } + + if (!showHelp) { + + MaskFastaFromBed *maskFasta = new MaskFastaFromBed(fastaInFile, bedFile, fastaOutFile, softMask); + delete maskFasta; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Mask a fasta file based on feature coordinates." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -out <fasta> -bed <bed/gff/vcf>" << endl << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "Options:" << endl; - cerr << "\t-fi\tInput FASTA file" << endl; - cerr << "\t-bed\tBED/GFF/VCF file of ranges to mask in -fi" << endl; - cerr << "\t-fo\tOutput FASTA file" << endl; - cerr << "\t-soft\tEnforce \"soft\" masking. That is, instead of masking with Ns," << endl; - cerr << "\t\tmask with lower-case bases." << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Mask a fasta file based on feature coordinates." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -fi <fasta> -out <fasta> -bed <bed/gff/vcf>" << endl << endl; + + cerr << "Options:" << endl; + cerr << "\t-fi\tInput FASTA file" << endl; + cerr << "\t-bed\tBED/GFF/VCF file of ranges to mask in -fi" << endl; + cerr << "\t-fo\tOutput FASTA file" << endl; + cerr << "\t-soft\tEnforce \"soft\" masking. That is, instead of masking with Ns," << endl; + cerr << "\t\tmask with lower-case bases." << endl; + + // end the program here + exit(1); - // end the program here - exit(1); - } diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index acaff78c574c43aa7d6ba3088ca6d2722056f1e6..4cb56e762780a87a2a40d12ec073787226b40877 100644 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -14,14 +14,14 @@ void ReportMergedNames(const map<string, bool> &names) { - unsigned int n = 0; - map<string, bool>::const_iterator nameItr = names.begin(); - map<string, bool>::const_iterator nameEnd = names.end(); - for (; nameItr != nameEnd; ++nameItr) { - if (n < (names.size() - 1)) {cout << nameItr->first << ";";} - else {cout << nameItr->first;} - n++; - } + unsigned int n = 0; + map<string, bool>::const_iterator nameItr = names.begin(); + map<string, bool>::const_iterator nameEnd = names.end(); + for (; nameItr != nameEnd; ++nameItr) { + if (n < (names.size() - 1)) {cout << nameItr->first << ";";} + else {cout << nameItr->first;} + n++; + } } // =============== @@ -29,18 +29,18 @@ void ReportMergedNames(const map<string, bool> &names) { // =============== BedMerge::BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand, bool &reportNames) { - _bedFile = bedFile; - _numEntries = numEntries; - _maxDistance = -1 * maxDistance; - _forceStrand = forceStrand; - _reportNames = reportNames; - - _bed = new BedFile(bedFile); - - if (_forceStrand == false) - MergeBed(); - else - MergeBedStranded(); + _bedFile = bedFile; + _numEntries = numEntries; + _maxDistance = -1 * maxDistance; + _forceStrand = forceStrand; + _reportNames = reportNames; + + _bed = new BedFile(bedFile); + + if (_forceStrand == false) + MergeBed(); + else + MergeBedStranded(); } @@ -56,117 +56,117 @@ BedMerge::~BedMerge(void) { // ===================================================== void BedMerge::MergeBed() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - - CHRPOS minStart = INT_MAX; - CHRPOS maxEnd = 0; - bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. - int prev = -1; - unsigned int curr = 0; - int mergeCount = 1; - map<string, bool> names; - - // loop through the BED entries for this chromosome - // and look for overlaps - for (curr = 0; curr < bedList.size(); ++curr) { - - // make sure prev points to an actual element - if (prev < 0) { - prev = curr; - continue; - } - - // Is there an overlap between the current and previous entries? - if ( overlaps(bedList[prev].start, bedList[prev].end, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - OIP = true; - mergeCount++; - minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); - maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); - - names[bedList[prev].name] = true; - names[bedList[curr].name] = true; - } - else if ( overlaps(minStart, maxEnd, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - mergeCount++; - minStart = min(minStart, bedList[curr].start); - maxEnd = max(maxEnd, bedList[curr].end); - names[bedList[curr].name] = true; - } - else { - // was there an overlap befor the current entry broke it? - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << endl; - } - else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; - } - } - else { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl; - } - else { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; - } - } - - // reset things for the next overlapping "block" - OIP = false; - mergeCount = 1; - minStart = INT_MAX; - maxEnd = 0; - - names.clear(); - names[bedList[curr].name] = true; - } - prev = curr; - } - - // clean up based on the last entry for the current chromosome - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << endl; - } - else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; - } - } - else { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl; - } - else { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; - } - } - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + CHRPOS minStart = INT_MAX; + CHRPOS maxEnd = 0; + bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. + int prev = -1; + unsigned int curr = 0; + int mergeCount = 1; + map<string, bool> names; + + // loop through the BED entries for this chromosome + // and look for overlaps + for (curr = 0; curr < bedList.size(); ++curr) { + + // make sure prev points to an actual element + if (prev < 0) { + prev = curr; + continue; + } + + // Is there an overlap between the current and previous entries? + if ( overlaps(bedList[prev].start, bedList[prev].end, + bedList[curr].start, bedList[curr].end) >= _maxDistance) { + OIP = true; + mergeCount++; + minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); + maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); + + names[bedList[prev].name] = true; + names[bedList[curr].name] = true; + } + else if ( overlaps(minStart, maxEnd, + bedList[curr].start, bedList[curr].end) >= _maxDistance) { + mergeCount++; + minStart = min(minStart, bedList[curr].start); + maxEnd = max(maxEnd, bedList[curr].end); + names[bedList[curr].name] = true; + } + else { + // was there an overlap befor the current entry broke it? + if (OIP) { + if (_numEntries) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; + ReportMergedNames(names); + cout << endl; + } + else { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; + } + } + else { + if (_numEntries) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl; + } + else { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; + } + } + + // reset things for the next overlapping "block" + OIP = false; + mergeCount = 1; + minStart = INT_MAX; + maxEnd = 0; + + names.clear(); + names[bedList[curr].name] = true; + } + prev = curr; + } + + // clean up based on the last entry for the current chromosome + if (OIP) { + if (_numEntries) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; + ReportMergedNames(names); + cout << endl; + } + else { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; + } + } + else { + if (_numEntries) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl; + } + else { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; + } + } + } } @@ -175,140 +175,140 @@ void BedMerge::MergeBed() { // ================================================================================== void BedMerge::MergeBedStranded() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); - // loop through each chromosome and merge their BED entries - masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); - masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + // loop through each chromosome and merge their BED entries + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - - // make a list of the two strands to merge separately. - vector<string> strands(2); - strands[0] = "+"; - strands[1] = "-"; - - // do two passes, one for each strand. - for (unsigned int s = 0; s < strands.size(); s++) { - - CHRPOS minStart = INT_MAX; - CHRPOS maxEnd = 0; - bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. - int prev = -1; - unsigned int curr = 0; - int mergeCount = 1; - int numOnStrand = 0; - map<string, bool> names; - - // loop through the BED entries for this chromosome - // and look for overlaps - for (curr = 0; curr < bedList.size(); ++curr) { - - // if forcing strandedness, move on if the hit - // is not on the current strand. - - if (bedList[curr].strand != strands[s]) { - continue; // continue force the next iteration of the for loop. - } - else { - numOnStrand++; - } - - // make sure prev points to an actual element on the - // current strand - if (prev < 0) { - if (bedList[curr].strand == strands[s]) { - prev = curr; - } - continue; - } - - if ( overlaps(bedList[prev].start, bedList[prev].end, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - OIP = true; - mergeCount++; - minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); - maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); - - names[bedList[prev].name] = true; - names[bedList[curr].name] = true; - } - else if ( overlaps(minStart, maxEnd, - bedList[curr].start, bedList[curr].end) >= _maxDistance) { - mergeCount++; - minStart = min(minStart, bedList[curr].start); - maxEnd = max(maxEnd, bedList[curr].end); - names[bedList[curr].name] = true; - } - else { - - // was there an overlap before the current entry broke it? - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << "\t" << strands[s] << endl; - } - else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; - } - } - else { - if ((_numEntries) && (numOnStrand > 0)) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; - } - else if (numOnStrand > 0) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; - } - } - - // reset things for the next overlapping "block" - OIP = false; - mergeCount = 1; - minStart = INT_MAX; - maxEnd = 0; - names.clear(); - - // add the name of the current element in prep for the next block - names[bedList[curr].name] = true; - } - prev = curr; - } - - // clean up based on the last entry for the current chromosome - if (OIP) { - if (_numEntries) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if (_reportNames) { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; - ReportMergedNames(names); - cout << "\t" << strands[s] << endl; - } - else { - cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; - } - } - else { - if ((_numEntries) && (numOnStrand > 0)) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; - } - else if ((_reportNames) && (numOnStrand > 0)) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; - } - else if (numOnStrand > 0) { - cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; - } - } - } - } + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // make a list of the two strands to merge separately. + vector<string> strands(2); + strands[0] = "+"; + strands[1] = "-"; + + // do two passes, one for each strand. + for (unsigned int s = 0; s < strands.size(); s++) { + + CHRPOS minStart = INT_MAX; + CHRPOS maxEnd = 0; + bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. + int prev = -1; + unsigned int curr = 0; + int mergeCount = 1; + int numOnStrand = 0; + map<string, bool> names; + + // loop through the BED entries for this chromosome + // and look for overlaps + for (curr = 0; curr < bedList.size(); ++curr) { + + // if forcing strandedness, move on if the hit + // is not on the current strand. + + if (bedList[curr].strand != strands[s]) { + continue; // continue force the next iteration of the for loop. + } + else { + numOnStrand++; + } + + // make sure prev points to an actual element on the + // current strand + if (prev < 0) { + if (bedList[curr].strand == strands[s]) { + prev = curr; + } + continue; + } + + if ( overlaps(bedList[prev].start, bedList[prev].end, + bedList[curr].start, bedList[curr].end) >= _maxDistance) { + OIP = true; + mergeCount++; + minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); + maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); + + names[bedList[prev].name] = true; + names[bedList[curr].name] = true; + } + else if ( overlaps(minStart, maxEnd, + bedList[curr].start, bedList[curr].end) >= _maxDistance) { + mergeCount++; + minStart = min(minStart, bedList[curr].start); + maxEnd = max(maxEnd, bedList[curr].end); + names[bedList[curr].name] = true; + } + else { + + // was there an overlap before the current entry broke it? + if (OIP) { + if (_numEntries) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; + ReportMergedNames(names); + cout << "\t" << strands[s] << endl; + } + else { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; + } + } + else { + if ((_numEntries) && (numOnStrand > 0)) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; + } + else if (numOnStrand > 0) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; + } + } + + // reset things for the next overlapping "block" + OIP = false; + mergeCount = 1; + minStart = INT_MAX; + maxEnd = 0; + names.clear(); + + // add the name of the current element in prep for the next block + names[bedList[curr].name] = true; + } + prev = curr; + } + + // clean up based on the last entry for the current chromosome + if (OIP) { + if (_numEntries) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else if (_reportNames) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; + ReportMergedNames(names); + cout << "\t" << strands[s] << endl; + } + else { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; + } + } + else { + if ((_numEntries) && (numOnStrand > 0)) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else if ((_reportNames) && (numOnStrand > 0)) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; + } + else if (numOnStrand > 0) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; + } + } + } + } } diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h index 4567865a581e10a4f90bb06531afb297486c0ec9..37b721565bd39a87a62542f117aea9561e2a9c66 100644 --- a/src/mergeBed/mergeBed.h +++ b/src/mergeBed/mergeBed.h @@ -28,7 +28,7 @@ class BedMerge { public: - // constructor + // constructor BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand, bool &reportNames); // destructor @@ -38,13 +38,13 @@ public: void MergeBedStranded(); private: - - string _bedFile; - bool _numEntries; - bool _forceStrand; - bool _reportNames; - int _maxDistance; - // instance of a bed file class. - BedFile *_bed; + + string _bedFile; + bool _numEntries; + bool _forceStrand; + bool _reportNames; + int _maxDistance; + // instance of a bed file class. + BedFile *_bed; }; diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp index e730d627d67087c710d18da569bb4eb4f9ca2ddb..c756e38a09e50675f3dcad982fe137c2b03d121d 100644 --- a/src/mergeBed/mergeMain.cpp +++ b/src/mergeBed/mergeMain.cpp @@ -26,112 +26,112 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - int maxDistance = 0; - - // input arguments - bool haveBed = true; - bool numEntries = false; - bool haveMaxDistance = false; - bool forceStrand = false; - bool reportNames = false; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-n", 2, parameterLength)) { - numEntries = true; - } - else if(PARAMETER_CHECK("-d", 2, parameterLength)) { - if ((i+1) < argc) { - haveMaxDistance = true; - maxDistance = atoi(argv[i + 1]); - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-nms", 4, parameterLength)) { - reportNames = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; - showHelp = true; - } - if (reportNames && numEntries) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -n OR -nms, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames); - delete bm; - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + int maxDistance = 0; + + // input arguments + bool haveBed = true; + bool numEntries = false; + bool haveMaxDistance = false; + bool forceStrand = false; + bool reportNames = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-n", 2, parameterLength)) { + numEntries = true; + } + else if(PARAMETER_CHECK("-d", 2, parameterLength)) { + if ((i+1) < argc) { + haveMaxDistance = true; + maxDistance = atoi(argv[i + 1]); + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if (PARAMETER_CHECK("-nms", 4, parameterLength)) { + reportNames = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; + showHelp = true; + } + if (reportNames && numEntries) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -n OR -nms, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames); + delete bm; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Merges overlapping BED/GFF/VCF entries into a single interval." << endl << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Options: " << endl; - cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; - cerr << "\t\tthat are the same strand." << endl; - cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; + cerr << "Summary: Merges overlapping BED/GFF/VCF entries into a single interval." << endl << endl; - cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl; - cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; + cerr << "Options: " << endl; + cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; + cerr << "\t\tthat are the same strand." << endl; + cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; - cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; - cerr << "\t\tto be merged." << endl; - cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-nms\t" << "Report the names of the merged features separated by semicolons." << endl << endl; - + cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl; + cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl; - // end the program here - exit(1); + + cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; + cerr << "\t\tto be merged." << endl; + cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-nms\t" << "Report the names of the merged features separated by semicolons." << endl << endl; + + + // end the program here + exit(1); } diff --git a/src/overlap/overlap.cpp b/src/overlap/overlap.cpp index c7024f63545aa1c8c8e35cd00d4dd0caa12492f0..63ef656911ef9ae238217058b0bd4020e66bbaba 100644 --- a/src/overlap/overlap.cpp +++ b/src/overlap/overlap.cpp @@ -34,169 +34,169 @@ void ComputeOverlaps(istream &input, short &s1Col, short &e1Col, short &s2Col, s int main(int argc, char* argv[]) { - // input files - string inFile = "stdin"; - string columns; - - // our configuration variables - bool showHelp = false; - bool haveInFile = true; - bool haveColumns = false; - - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - inFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-cols", 5, parameterLength)) { - haveColumns = true; - columns = argv[i + 1]; - i++; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (!haveInFile ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i file. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - // Split the column string sent by the user into discrete column numbers - // A comma separated string is expected. - vector<string> posColumns; - Tokenize(columns, posColumns, ","); - - if (posColumns.size() != 4) { - cerr << endl << "*****" << endl << "*****ERROR: Please specify 4, comma-separated position columns. " << endl << "*****" << endl; - ShowHelp(); - } - else { - short s1, e1, s2, e2; - s1 = atoi(posColumns[0].c_str()); - e1 = atoi(posColumns[1].c_str()); - s2 = atoi(posColumns[2].c_str()); - e2 = atoi(posColumns[3].c_str()); - - DetermineInput(inFile, s1, e1, s2, e2); - } - } - else { - ShowHelp(); - } + // input files + string inFile = "stdin"; + string columns; + + // our configuration variables + bool showHelp = false; + bool haveInFile = true; + bool haveColumns = false; + + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + inFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-cols", 5, parameterLength)) { + haveColumns = true; + columns = argv[i + 1]; + i++; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (!haveInFile ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i file. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + // Split the column string sent by the user into discrete column numbers + // A comma separated string is expected. + vector<string> posColumns; + Tokenize(columns, posColumns, ","); + + if (posColumns.size() != 4) { + cerr << endl << "*****" << endl << "*****ERROR: Please specify 4, comma-separated position columns. " << endl << "*****" << endl; + ShowHelp(); + } + else { + short s1, e1, s2, e2; + s1 = atoi(posColumns[0].c_str()); + e1 = atoi(posColumns[1].c_str()); + s2 = atoi(posColumns[2].c_str()); + e2 = atoi(posColumns[3].c_str()); + + DetermineInput(inFile, s1, e1, s2, e2); + } + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Computes the amount of overlap (positive values)" << endl; - cerr << "\t or distance (negative values) between genome features" << endl; - cerr << "\t and reports the result at the end of the same line." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input> -cols s1,e1,s2,e2 " << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t-i\t" << "Input file. Use \"stdin\" for pipes." << endl << endl; - - cerr << "\t-cols\t" << "Specify the columns (1-based) for the starts and ends of the" << endl; - cerr << "\t\tfeatures for which you'd like to compute the overlap/distance." << endl; - cerr << "\t\tThe columns must be listed in the following order: " << endl << endl; - cerr << "\t\tstart1,end1,start2,end2" << endl << endl; - - cerr << "Example: " << endl; - cerr << "\t$ windowBed -a A.bed -b B.bed -w 10" << endl; - cerr << "\tchr1 10 20 A chr1 15 25 B" << endl; - cerr << "\tchr1 10 20 C chr1 25 35 D" << endl << endl; - cerr << "\t$ windowBed -a A.bed -b B.bed -w 10 | overlap -i stdin -cols 2,3,6,7" << endl; - cerr << "\tchr1 10 20 A chr1 15 25 B 5" << endl; - cerr << "\tchr1 10 20 C chr1 25 35 D -5" << endl; - - // end the program here - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Computes the amount of overlap (positive values)" << endl; + cerr << "\t or distance (negative values) between genome features" << endl; + cerr << "\t and reports the result at the end of the same line." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input> -cols s1,e1,s2,e2 " << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-i\t" << "Input file. Use \"stdin\" for pipes." << endl << endl; + + cerr << "\t-cols\t" << "Specify the columns (1-based) for the starts and ends of the" << endl; + cerr << "\t\tfeatures for which you'd like to compute the overlap/distance." << endl; + cerr << "\t\tThe columns must be listed in the following order: " << endl << endl; + cerr << "\t\tstart1,end1,start2,end2" << endl << endl; + + cerr << "Example: " << endl; + cerr << "\t$ windowBed -a A.bed -b B.bed -w 10" << endl; + cerr << "\tchr1 10 20 A chr1 15 25 B" << endl; + cerr << "\tchr1 10 20 C chr1 25 35 D" << endl << endl; + cerr << "\t$ windowBed -a A.bed -b B.bed -w 10 | overlap -i stdin -cols 2,3,6,7" << endl; + cerr << "\tchr1 10 20 A chr1 15 25 B 5" << endl; + cerr << "\tchr1 10 20 C chr1 25 35 D -5" << endl; + + // end the program here + exit(1); } void DetermineInput(string &inFile, short &s1Col, short &e1Col, short &s2Col, short &e2Col) { - - - if (inFile != "stdin") { // process a file - - ifstream in(inFile.c_str(), ios::in); - if ( !in ) { - cerr << "Error: The requested input file (" << inFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - ComputeOverlaps(in, s1Col, e1Col, s2Col, e2Col); - } - else ComputeOverlaps(cin, s1Col, e1Col, s2Col, e2Col); + + + if (inFile != "stdin") { // process a file + + ifstream in(inFile.c_str(), ios::in); + if ( !in ) { + cerr << "Error: The requested input file (" << inFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + ComputeOverlaps(in, s1Col, e1Col, s2Col, e2Col); + } + else ComputeOverlaps(cin, s1Col, e1Col, s2Col, e2Col); } - + void ComputeOverlaps(istream &input, short &s1Col, short &e1Col, short &s2Col, short &e2Col) { - - int lineNum = 0; - string inLine; - vector<string> inFields; - - int overlap; - - char *s1End, *e1End, *s2End, *e2End; - long s1, e1, s2, e2; - - while (getline(input, inLine)) { - lineNum++; - Tokenize(inLine, inFields); - - if (inFields.size() > 1) { - - // test if columns 2 and 3 are integers. If so, assume BED. - s1 = strtol(inFields[s1Col-1].c_str(), &s1End, 10); - e1 = strtol(inFields[e1Col-1].c_str(), &e1End, 10); - s2 = strtol(inFields[s2Col-1].c_str(), &s2End, 10); - e2 = strtol(inFields[e2Col-1].c_str(), &e2End, 10); - - // strtol will set pointers to the start of the string if non-integral, base 10 - // if they all check out, we have valid numeric columns. Otherwise, complain. - if (s1End != inFields[s1Col-1].c_str() && - e1End != inFields[e1Col-1].c_str() && - s2End != inFields[s2Col-1].c_str() && - e2End != inFields[e2Col-1].c_str()) { - - overlap = overlaps(s1, e1, s2, e2); - printf("%s\t%d\n", inLine.c_str(), overlap); - } - else { - cerr << "One of your columns appears to be non-numeric at line " << lineNum << ". Exiting..." << endl << endl; - exit(1); - } - } - inFields.clear(); - } + + int lineNum = 0; + string inLine; + vector<string> inFields; + + int overlap; + + char *s1End, *e1End, *s2End, *e2End; + long s1, e1, s2, e2; + + while (getline(input, inLine)) { + lineNum++; + Tokenize(inLine, inFields); + + if (inFields.size() > 1) { + + // test if columns 2 and 3 are integers. If so, assume BED. + s1 = strtol(inFields[s1Col-1].c_str(), &s1End, 10); + e1 = strtol(inFields[e1Col-1].c_str(), &e1End, 10); + s2 = strtol(inFields[s2Col-1].c_str(), &s2End, 10); + e2 = strtol(inFields[e2Col-1].c_str(), &e2End, 10); + + // strtol will set pointers to the start of the string if non-integral, base 10 + // if they all check out, we have valid numeric columns. Otherwise, complain. + if (s1End != inFields[s1Col-1].c_str() && + e1End != inFields[e1Col-1].c_str() && + s2End != inFields[s2Col-1].c_str() && + e2End != inFields[e2Col-1].c_str()) { + + overlap = overlaps(s1, e1, s2, e2); + printf("%s\t%d\n", inLine.c_str(), overlap); + } + else { + cerr << "One of your columns appears to be non-numeric at line " << lineNum << ". Exiting..." << endl << endl; + exit(1); + } + } + inFields.clear(); + } } diff --git a/src/pairToBed/pairToBed.cpp b/src/pairToBed/pairToBed.cpp index dcb1245f765234c5ad6cb4cdea32f03a9762aaf8..16227a85f5ef645424c1dea550c3a215d5444b75 100644 --- a/src/pairToBed/pairToBed.cpp +++ b/src/pairToBed/pairToBed.cpp @@ -15,49 +15,49 @@ bool IsCorrectMappingForBEDPE (const BamAlignment &bam) { - if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { - return true; - } - else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { - return true; - } - else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { - return true; - } - else return false; + if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { + return true; + } + else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { + return true; + } + else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { + return true; + } + else return false; } /* - Constructor + Constructor */ -BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction, - string searchType, bool forceStrand, bool bamInput, - bool bamOutput, bool uncompressedBam, bool useEditDistance) { - - _bedAFilePE = bedAFilePE; - _bedBFile = bedBFile; - _overlapFraction = overlapFraction; - _forceStrand = forceStrand; - _useEditDistance = useEditDistance; - _searchType = searchType; - _bamInput = bamInput; - _bamOutput = bamOutput; +BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction, + string searchType, bool forceStrand, bool bamInput, + bool bamOutput, bool uncompressedBam, bool useEditDistance) { + + _bedAFilePE = bedAFilePE; + _bedBFile = bedBFile; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; + _useEditDistance = useEditDistance; + _searchType = searchType; + _bamInput = bamInput; + _bamOutput = bamOutput; _isUncompressedBam = uncompressedBam; - - _bedA = new BedFilePE(bedAFilePE); - _bedB = new BedFile(bedBFile); - - if (_bamInput == false) - IntersectBedPE(); - else - IntersectBamPE(_bedAFilePE); + + _bedA = new BedFilePE(bedAFilePE); + _bedB = new BedFile(bedBFile); + + if (_bamInput == false) + IntersectBedPE(); + else + IntersectBamPE(_bedAFilePE); } /* - Destructor + Destructor */ BedIntersectPE::~BedIntersectPE(void) { @@ -67,453 +67,453 @@ BedIntersectPE::~BedIntersectPE(void) { void BedIntersectPE::FindOverlaps(const BEDPE &a, vector<BED> &hits1, vector<BED> &hits2, const string &type) { - // list of hits on each end of BEDPE - // that exceed the requested overlap fraction - vector<BED> qualityHits1; - vector<BED> qualityHits2; - - // count of hits on each end of BEDPE - // that exceed the requested overlap fraction - int numOverlapsEnd1 = 0; - int numOverlapsEnd2 = 0; - - // make sure we have a valid chromosome before we search - if (a.chrom1 != ".") { - // Find the quality hits between ***end1*** of the BEDPE and the B BED file - _bedB->FindOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, hits1, _forceStrand); - - vector<BED>::const_iterator h = hits1.begin(); - vector<BED>::const_iterator hitsEnd = hits1.end(); - for (; h != hitsEnd; ++h) { - - int s = max(a.start1, h->start); - int e = min(a.end1, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end1 - a.start1); // the length of a in b.p. - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - numOverlapsEnd1++; - - if (type == "either") { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*h); - } - else { - qualityHits1.push_back(*h); - } - } - } - } - - - // make sure we have a valid chromosome before we search - if (a.chrom2 != ".") { - // Now find the quality hits between ***end2*** of the BEDPE and the B BED file - _bedB->FindOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, hits2, _forceStrand); - - vector<BED>::const_iterator h = hits2.begin(); - vector<BED>::const_iterator hitsEnd = hits2.end(); - for (; h != hitsEnd; ++h) { - - int s = max(a.start2, h->start); - int e = min(a.end2, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end2 - a.start2); // the length of a in b.p. - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - numOverlapsEnd2++; - - if (type == "either") { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*h); - } - else { - qualityHits2.push_back(*h); - } - } - } - } - - // Now report the hits depending on what the user has requested. - if (type == "neither") { - if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { - _bedA->reportBedPENewLine(a); - } - } - else if (type == "notboth") { - if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { - _bedA->reportBedPENewLine(a); - } - else if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { - for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*q); - } - } - else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { - for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*q); - } - } - } - else if (type == "xor") { - if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { - for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*q); - } - } - else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { - for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*q); - } - } - } - else if (type == "both") { - if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) { - for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*q); - } - for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*q); - } - } - } + // list of hits on each end of BEDPE + // that exceed the requested overlap fraction + vector<BED> qualityHits1; + vector<BED> qualityHits2; + + // count of hits on each end of BEDPE + // that exceed the requested overlap fraction + int numOverlapsEnd1 = 0; + int numOverlapsEnd2 = 0; + + // make sure we have a valid chromosome before we search + if (a.chrom1 != ".") { + // Find the quality hits between ***end1*** of the BEDPE and the B BED file + _bedB->FindOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, hits1, _forceStrand); + + vector<BED>::const_iterator h = hits1.begin(); + vector<BED>::const_iterator hitsEnd = hits1.end(); + for (; h != hitsEnd; ++h) { + + int s = max(a.start1, h->start); + int e = min(a.end1, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int aLength = (a.end1 - a.start1); // the length of a in b.p. + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { + numOverlapsEnd1++; + + if (type == "either") { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*h); + } + else { + qualityHits1.push_back(*h); + } + } + } + } + + + // make sure we have a valid chromosome before we search + if (a.chrom2 != ".") { + // Now find the quality hits between ***end2*** of the BEDPE and the B BED file + _bedB->FindOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, hits2, _forceStrand); + + vector<BED>::const_iterator h = hits2.begin(); + vector<BED>::const_iterator hitsEnd = hits2.end(); + for (; h != hitsEnd; ++h) { + + int s = max(a.start2, h->start); + int e = min(a.end2, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int aLength = (a.end2 - a.start2); // the length of a in b.p. + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { + numOverlapsEnd2++; + + if (type == "either") { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*h); + } + else { + qualityHits2.push_back(*h); + } + } + } + } + + // Now report the hits depending on what the user has requested. + if (type == "neither") { + if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { + _bedA->reportBedPENewLine(a); + } + } + else if (type == "notboth") { + if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { + _bedA->reportBedPENewLine(a); + } + else if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { + for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*q); + } + } + else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { + for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*q); + } + } + } + else if (type == "xor") { + if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { + for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*q); + } + } + else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { + for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*q); + } + } + } + else if (type == "both") { + if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) { + for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*q); + } + for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*q); + } + } + } } bool BedIntersectPE::FindOneOrMoreOverlaps(const BEDPE &a, const string &type) { - // flags for the existence of hits on each end of BEDPE - // that exceed the requested overlap fraction - bool end1Found = false; - bool end2Found = false; - - // Look for overlaps in end 1 assuming we have an aligned chromosome. - if (a.chrom1 != ".") { - end1Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, - _forceStrand, _overlapFraction); - - // can we bail out without checking end2? - if ((type == "either") && (end1Found == true)) return true; - else if ((type == "neither") && (end1Found == true)) return false; - else if ((type == "notboth") && (end1Found == false)) return true; - else if ((type == "both") && (end1Found == false)) return false; - } - - // Now look for overlaps in end 2 assuming we have an aligned chromosome. - if (a.chrom2 != ".") { - end2Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, - _forceStrand, _overlapFraction); - - if ((type == "either") && (end2Found == true)) return true; - else if ((type == "neither") && (end2Found == true)) return false; - else if ((type == "notboth") && (end2Found == false)) return true; - else if ((type == "both") && (end2Found == false)) return false; - } - - // Now report the hits depending on what the user has requested. - if (type == "notboth") { - if ( (end1Found == false) || (end2Found == false) ) return true; - else return false; - } - else if (type == "either") { - if ( (end1Found == false) && (end2Found == false) ) return false; - } - else if (type == "neither") { - if ( (end1Found == false) && (end2Found == false) ) return true; - else return false; - } - else if (type == "xor") { - if ( (end1Found == true) && (end2Found == false) ) return true; - else if ( (end1Found == false) && (end2Found == true) ) return true; - else return false; - } - else if (type == "both") { - if ( (end1Found == true) && (end2Found == true) ) return true; - return false; - } - return false; + // flags for the existence of hits on each end of BEDPE + // that exceed the requested overlap fraction + bool end1Found = false; + bool end2Found = false; + + // Look for overlaps in end 1 assuming we have an aligned chromosome. + if (a.chrom1 != ".") { + end1Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, + _forceStrand, _overlapFraction); + + // can we bail out without checking end2? + if ((type == "either") && (end1Found == true)) return true; + else if ((type == "neither") && (end1Found == true)) return false; + else if ((type == "notboth") && (end1Found == false)) return true; + else if ((type == "both") && (end1Found == false)) return false; + } + + // Now look for overlaps in end 2 assuming we have an aligned chromosome. + if (a.chrom2 != ".") { + end2Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, + _forceStrand, _overlapFraction); + + if ((type == "either") && (end2Found == true)) return true; + else if ((type == "neither") && (end2Found == true)) return false; + else if ((type == "notboth") && (end2Found == false)) return true; + else if ((type == "both") && (end2Found == false)) return false; + } + + // Now report the hits depending on what the user has requested. + if (type == "notboth") { + if ( (end1Found == false) || (end2Found == false) ) return true; + else return false; + } + else if (type == "either") { + if ( (end1Found == false) && (end2Found == false) ) return false; + } + else if (type == "neither") { + if ( (end1Found == false) && (end2Found == false) ) return true; + else return false; + } + else if (type == "xor") { + if ( (end1Found == true) && (end2Found == false) ) return true; + else if ( (end1Found == false) && (end2Found == true) ) return true; + else return false; + } + else if (type == "both") { + if ( (end1Found == true) && (end2Found == true) ) return true; + return false; + } + return false; } void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type) { - // count of hits on _between_ end of BEDPE - // that exceed the requested overlap fraction - int numOverlaps = 0; - CHRPOS spanStart = 0; - CHRPOS spanEnd = 0; - CHRPOS spanLength = 0; - - if ((type == "ispan") || (type == "notispan")) { - spanStart = a.end1; - spanEnd = a.start2; - if (a.end1 > a.start2) { - spanStart = a.end2; - spanEnd = a.start1; - } - } - else if ((type == "ospan") || (type == "notospan")) { - spanStart = a.start1; - spanEnd = a.end2; - if (a.start1 > a.start2) { - spanStart = a.start2; - spanEnd = a.end1; - } - } - spanLength = spanEnd - spanStart; - - // get the hits for the span - _bedB->FindOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, hits, _forceStrand); - - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - - int s = max(spanStart, h->start); - int e = min(spanEnd, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int spanLength = (spanEnd - spanStart); // the length of a in b.p. - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) spanLength ) >= _overlapFraction ) { - numOverlaps++; - if ((type == "ispan") || (type == "ospan")) { - _bedA->reportBedPETab(a); - _bedB->reportBedNewLine(*h); - } - } - } - - if ( ( (type == "notispan") || (type == "notospan") ) && numOverlaps == 0 ) { - _bedA->reportBedPENewLine(a); - } + // count of hits on _between_ end of BEDPE + // that exceed the requested overlap fraction + int numOverlaps = 0; + CHRPOS spanStart = 0; + CHRPOS spanEnd = 0; + CHRPOS spanLength = 0; + + if ((type == "ispan") || (type == "notispan")) { + spanStart = a.end1; + spanEnd = a.start2; + if (a.end1 > a.start2) { + spanStart = a.end2; + spanEnd = a.start1; + } + } + else if ((type == "ospan") || (type == "notospan")) { + spanStart = a.start1; + spanEnd = a.end2; + if (a.start1 > a.start2) { + spanStart = a.start2; + spanEnd = a.end1; + } + } + spanLength = spanEnd - spanStart; + + // get the hits for the span + _bedB->FindOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, hits, _forceStrand); + + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + + int s = max(spanStart, h->start); + int e = min(spanEnd, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int spanLength = (spanEnd - spanStart); // the length of a in b.p. + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) spanLength ) >= _overlapFraction ) { + numOverlaps++; + if ((type == "ispan") || (type == "ospan")) { + _bedA->reportBedPETab(a); + _bedB->reportBedNewLine(*h); + } + } + } + + if ( ( (type == "notispan") || (type == "notospan") ) && numOverlaps == 0 ) { + _bedA->reportBedPENewLine(a); + } } bool BedIntersectPE::FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type) { - int spanStart = 0; - int spanEnd = 0; - int spanLength = 0; - bool overlapFound; - - if ((type == "ispan") || (type == "notispan")) { - spanStart = a.end1; - spanEnd = a.start2; - if (a.end1 > a.start2) { - spanStart = a.end2; - spanEnd = a.start1; - } - } - else if ((type == "ospan") || (type == "notospan")) { - spanStart = a.start1; - spanEnd = a.end2; - if (a.start1 > a.start2) { - spanStart = a.start2; - spanEnd = a.end1; - } - } - spanLength = spanEnd - spanStart; - - overlapFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, - _forceStrand, _overlapFraction); - - return overlapFound; + int spanStart = 0; + int spanEnd = 0; + int spanLength = 0; + bool overlapFound; + + if ((type == "ispan") || (type == "notispan")) { + spanStart = a.end1; + spanEnd = a.start2; + if (a.end1 > a.start2) { + spanStart = a.end2; + spanEnd = a.start1; + } + } + else if ((type == "ospan") || (type == "notospan")) { + spanStart = a.start1; + spanEnd = a.end2; + if (a.start1 > a.start2) { + spanStart = a.start2; + spanEnd = a.end1; + } + } + spanLength = spanEnd - spanStart; + + overlapFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, + _forceStrand, _overlapFraction); + + return overlapFound; } void BedIntersectPE::IntersectBedPE() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - int lineNum = 0; // current input line number - vector<BED> hits, hits1, hits2; // vector of potential hits - - // reserve some space - hits.reserve(100); - hits1.reserve(100); - hits2.reserve(100); - - BEDPE a, nullBedPE; - BedLineStatus bedStatus; - - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - if ( (_searchType == "ispan") || (_searchType == "ospan") || - (_searchType == "notispan") || (_searchType == "notospan") ) { - if (a.chrom1 == a.chrom2) { - FindSpanningOverlaps(a, hits, _searchType); - hits.clear(); - } - } - else { - FindOverlaps(a, hits1, hits2, _searchType); - hits1.clear(); - hits2.clear(); - } - a = nullBedPE; - } - } - _bedA->Close(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + int lineNum = 0; // current input line number + vector<BED> hits, hits1, hits2; // vector of potential hits + + // reserve some space + hits.reserve(100); + hits1.reserve(100); + hits2.reserve(100); + + BEDPE a, nullBedPE; + BedLineStatus bedStatus; + + _bedA->Open(); + while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + if ( (_searchType == "ispan") || (_searchType == "ospan") || + (_searchType == "notispan") || (_searchType == "notospan") ) { + if (a.chrom1 == a.chrom2) { + FindSpanningOverlaps(a, hits, _searchType); + hits.clear(); + } + } + else { + FindOverlaps(a, hits1, hits2, _searchType); + hits1.clear(); + hits2.clear(); + } + a = nullBedPE; + } + } + _bedA->Close(); } void BedIntersectPE::IntersectBamPE(string bamFile) { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - // open the BAM file - BamReader reader; - BamWriter writer; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // open a BAM output to stdout if we are writing BAM - if (_bamOutput == true) { - // open our BAM writer - writer.Open("stdout", header, refs, _isUncompressedBam); - } - - // track the previous and current sequence - // names so that we can identify blocks of - // alignments for a given read ID. - string prevName, currName; - prevName = currName = ""; - - vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file. - alignments.reserve(100); - - _bedA->bedType = 10; // it's a full BEDPE given it's BAM - - // rip through the BAM file and convert each mapped entry to BEDPE - BamAlignment bam1, bam2; - while (reader.GetNextAlignment(bam1)) { - // the alignment must be paired - if (bam1.IsPaired() == true) { - // grab the second alignment for the pair. - reader.GetNextAlignment(bam2); - - // require that the alignments are from the same query - if (bam1.Name == bam2.Name) { - ProcessBamBlock(bam1, bam2, refs, writer); - } - else { - cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl; - exit(1); - } - } - } - // close up - reader.Close(); - if (_bamOutput == true) { - writer.Close(); - } + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + // open the BAM file + BamReader reader; + BamWriter writer; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // open a BAM output to stdout if we are writing BAM + if (_bamOutput == true) { + // open our BAM writer + writer.Open("stdout", header, refs, _isUncompressedBam); + } + + // track the previous and current sequence + // names so that we can identify blocks of + // alignments for a given read ID. + string prevName, currName; + prevName = currName = ""; + + vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file. + alignments.reserve(100); + + _bedA->bedType = 10; // it's a full BEDPE given it's BAM + + // rip through the BAM file and convert each mapped entry to BEDPE + BamAlignment bam1, bam2; + while (reader.GetNextAlignment(bam1)) { + // the alignment must be paired + if (bam1.IsPaired() == true) { + // grab the second alignment for the pair. + reader.GetNextAlignment(bam2); + + // require that the alignments are from the same query + if (bam1.Name == bam2.Name) { + ProcessBamBlock(bam1, bam2, refs, writer); + } + else { + cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl; + exit(1); + } + } + } + // close up + reader.Close(); + if (_bamOutput == true) { + writer.Close(); + } } -void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, +void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, BamWriter &writer) { - - vector<BED> hits, hits1, hits2; // vector of potential hits - hits.reserve(1000); // reserve some space - hits1.reserve(1000); - hits2.reserve(1000); - - bool overlapsFound; // flag to indicate if overlaps were found - - if ( (_searchType == "either") || (_searchType == "xor") || - (_searchType == "both") || (_searchType == "notboth") || - (_searchType == "neither") ) { - - // create a new BEDPE feature from the BAM alignments. - BEDPE a; - ConvertBamToBedPE(bam1, bam2, refs, a); - if (_bamOutput == true) { // BAM output - // write to BAM if correct hits found - overlapsFound = FindOneOrMoreOverlaps(a, _searchType); - if (overlapsFound == true) { - writer.SaveAlignment(bam1); - writer.SaveAlignment(bam2); - } - } - else { // BEDPE output - FindOverlaps(a, hits1, hits2, _searchType); - hits1.clear(); - hits2.clear(); - } - } - else if ( (_searchType == "ispan") || (_searchType == "ospan") ) { - // only look for ispan and ospan when both ends are mapped. - if (bam1.IsMapped() && bam2.IsMapped()) { - // only do an inspan or outspan check if the alignment is intrachromosomal - if (bam1.RefID == bam2.RefID) { - // create a new BEDPE feature from the BAM alignments. - BEDPE a; - ConvertBamToBedPE(bam1, bam2, refs, a); - if (_bamOutput == true) { // BAM output - // look for overlaps, and write to BAM if >=1 were found - overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); - if (overlapsFound == true) { - writer.SaveAlignment(bam1); - writer.SaveAlignment(bam2); - } - } - else { // BEDPE output - FindSpanningOverlaps(a, hits, _searchType); - hits.clear(); - } - } - } - } - else if ( (_searchType == "notispan") || (_searchType == "notospan") ) { - // only look for notispan and notospan when both ends are mapped. - if (bam1.IsMapped() && bam2.IsMapped()) { - // only do an inspan or outspan check if the alignment is intrachromosomal - if (bam1.RefID == bam2.RefID) { - // create a new BEDPE feature from the BAM alignments. - BEDPE a; - ConvertBamToBedPE(bam1, bam2, refs, a); - if (_bamOutput == true) { // BAM output - // write to BAM if there were no overlaps - overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); - if (overlapsFound == false) { - writer.SaveAlignment(bam1); - writer.SaveAlignment(bam2); - } - } - else { // BEDPE output - FindSpanningOverlaps(a, hits, _searchType); - hits.clear(); - } - } - // if inter-chromosomal or orphaned, we know it's not ispan and not ospan - else if (_bamOutput == true) { - writer.SaveAlignment(bam1); - writer.SaveAlignment(bam2); - } - } - // if both ends aren't mapped, we know that it's notispan and not ospan - else if (_bamOutput == true) { - writer.SaveAlignment(bam1); - writer.SaveAlignment(bam2); - } - } + + vector<BED> hits, hits1, hits2; // vector of potential hits + hits.reserve(1000); // reserve some space + hits1.reserve(1000); + hits2.reserve(1000); + + bool overlapsFound; // flag to indicate if overlaps were found + + if ( (_searchType == "either") || (_searchType == "xor") || + (_searchType == "both") || (_searchType == "notboth") || + (_searchType == "neither") ) { + + // create a new BEDPE feature from the BAM alignments. + BEDPE a; + ConvertBamToBedPE(bam1, bam2, refs, a); + if (_bamOutput == true) { // BAM output + // write to BAM if correct hits found + overlapsFound = FindOneOrMoreOverlaps(a, _searchType); + if (overlapsFound == true) { + writer.SaveAlignment(bam1); + writer.SaveAlignment(bam2); + } + } + else { // BEDPE output + FindOverlaps(a, hits1, hits2, _searchType); + hits1.clear(); + hits2.clear(); + } + } + else if ( (_searchType == "ispan") || (_searchType == "ospan") ) { + // only look for ispan and ospan when both ends are mapped. + if (bam1.IsMapped() && bam2.IsMapped()) { + // only do an inspan or outspan check if the alignment is intrachromosomal + if (bam1.RefID == bam2.RefID) { + // create a new BEDPE feature from the BAM alignments. + BEDPE a; + ConvertBamToBedPE(bam1, bam2, refs, a); + if (_bamOutput == true) { // BAM output + // look for overlaps, and write to BAM if >=1 were found + overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); + if (overlapsFound == true) { + writer.SaveAlignment(bam1); + writer.SaveAlignment(bam2); + } + } + else { // BEDPE output + FindSpanningOverlaps(a, hits, _searchType); + hits.clear(); + } + } + } + } + else if ( (_searchType == "notispan") || (_searchType == "notospan") ) { + // only look for notispan and notospan when both ends are mapped. + if (bam1.IsMapped() && bam2.IsMapped()) { + // only do an inspan or outspan check if the alignment is intrachromosomal + if (bam1.RefID == bam2.RefID) { + // create a new BEDPE feature from the BAM alignments. + BEDPE a; + ConvertBamToBedPE(bam1, bam2, refs, a); + if (_bamOutput == true) { // BAM output + // write to BAM if there were no overlaps + overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); + if (overlapsFound == false) { + writer.SaveAlignment(bam1); + writer.SaveAlignment(bam2); + } + } + else { // BEDPE output + FindSpanningOverlaps(a, hits, _searchType); + hits.clear(); + } + } + // if inter-chromosomal or orphaned, we know it's not ispan and not ospan + else if (_bamOutput == true) { + writer.SaveAlignment(bam1); + writer.SaveAlignment(bam2); + } + } + // if both ends aren't mapped, we know that it's notispan and not ospan + else if (_bamOutput == true) { + writer.SaveAlignment(bam1); + writer.SaveAlignment(bam2); + } + } } diff --git a/src/pairToBed/pairToBed.h b/src/pairToBed/pairToBed.h index a52380a05fda11afab1454f4c807b54e57ef622b..d168a236980b1df57f6ad9825a665438a31a5091 100644 --- a/src/pairToBed/pairToBed.h +++ b/src/pairToBed/pairToBed.h @@ -41,121 +41,121 @@ class BedIntersectPE { public: - // constructor - BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction, - string searchType, bool forceStrand, bool bamInput, bool bamOutput, bool uncompressedBam, bool useEditDistance); + // constructor + BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction, + string searchType, bool forceStrand, bool bamInput, bool bamOutput, bool uncompressedBam, bool useEditDistance); - // destructor - ~BedIntersectPE(void); + // destructor + ~BedIntersectPE(void); - void FindOverlaps(const BEDPE &, vector<BED> &hits1, vector<BED> &hits2, const string &type); - - bool FindOneOrMoreOverlaps(const BEDPE &, const string &type); + void FindOverlaps(const BEDPE &, vector<BED> &hits1, vector<BED> &hits2, const string &type); - void FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type); - bool FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type); + bool FindOneOrMoreOverlaps(const BEDPE &, const string &type); + + void FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type); + bool FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type); + + void IntersectBedPE(); + void IntersectBamPE(string bamFile); + + void DetermineBedPEInput(); - void IntersectBedPE(); - void IntersectBamPE(string bamFile); - - void DetermineBedPEInput(); - private: - string _bedAFilePE; - string _bedBFile; - float _overlapFraction; - string _searchType; - bool _forceStrand; - bool _useEditDistance; - bool _bamInput; - bool _bamOutput; - bool _isUncompressedBam; + string _bedAFilePE; + string _bedBFile; + float _overlapFraction; + string _searchType; + bool _forceStrand; + bool _useEditDistance; + bool _bamInput; + bool _bamOutput; + bool _isUncompressedBam; - // instance of a paired-end bed file class. - BedFilePE *_bedA; + // instance of a paired-end bed file class. + BedFilePE *_bedA; - // instance of a bed file class. - BedFile *_bedB; + // instance of a bed file class. + BedFile *_bedB; - inline - void ConvertBamToBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, BEDPE &a) { + inline + void ConvertBamToBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, BEDPE &a) { - // initialize BEDPE variables - a.start1 = a.start2 = a.end1 = a.end2 = -1; - a.chrom1 = a.chrom2 = "."; + // initialize BEDPE variables + a.start1 = a.start2 = a.end1 = a.end2 = -1; + a.chrom1 = a.chrom2 = "."; a.strand1 = a.strand2 = '.'; - uint32_t editDistance1, editDistance2; - editDistance1 = editDistance2 = 0; - - // take the qname from end 1. - a.name = bam1.Name; - - // end 1 - if (bam1.IsMapped()) { - a.chrom1 = refs.at(bam1.RefID).RefName; - a.start1 = bam1.Position; - a.end1 = bam1.GetEndPosition(); - a.strand1 = "+"; - if (bam1.IsReverseStrand()) a.strand1 = "-"; - - // extract the edit distance from the NM tag - // if possible. otherwise, complain. - if (_useEditDistance == true) { - if (bam1.GetTag("NM", editDistance1) == false) { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - } - - // end 2 - if (bam2.IsMapped()) { - a.chrom2 = refs.at(bam2.RefID).RefName; - a.start2 = bam2.Position; - a.end2 = bam2.GetEndPosition(); - a.strand2 = "+"; - if (bam2.IsReverseStrand()) a.strand2 = "-"; - - // extract the edit distance from the NM tag - // if possible. otherwise, complain. - if (_useEditDistance == true) { - if (bam2.GetTag("NM", editDistance2) == false) { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - } - - // swap the ends if necessary - if ( a.chrom1 > a.chrom2 || ((a.chrom1 == a.chrom2) && (a.start1 > a.start2)) ) { - swap(a.chrom1, a.chrom2); - swap(a.start1, a.start2); - swap(a.end1, a.end2); - swap(a.strand1, a.strand2); - } - - // compute the minimum mapping quality b/w the two ends of the pair. - a.score = "0"; - if (_useEditDistance == false) { - if (bam1.IsMapped() == true && bam2.IsMapped() == true) - a.score = ToString(min(bam1.MapQuality, bam2.MapQuality)); - } - // BEDPE using edit distance - else { - if (bam1.IsMapped() == true && bam2.IsMapped() == true) - a.score = ToString((int) (editDistance1 + editDistance2)); - else if (bam1.IsMapped() == true) - a.score = ToString((int) editDistance1); - else if (bam2.IsMapped() == true) - a.score = ToString((int) editDistance2); - } - }; - - inline - void ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, - const RefVector &refs, - BamWriter &writer); + uint32_t editDistance1, editDistance2; + editDistance1 = editDistance2 = 0; + + // take the qname from end 1. + a.name = bam1.Name; + + // end 1 + if (bam1.IsMapped()) { + a.chrom1 = refs.at(bam1.RefID).RefName; + a.start1 = bam1.Position; + a.end1 = bam1.GetEndPosition(); + a.strand1 = "+"; + if (bam1.IsReverseStrand()) a.strand1 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (_useEditDistance == true) { + if (bam1.GetTag("NM", editDistance1) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + } + + // end 2 + if (bam2.IsMapped()) { + a.chrom2 = refs.at(bam2.RefID).RefName; + a.start2 = bam2.Position; + a.end2 = bam2.GetEndPosition(); + a.strand2 = "+"; + if (bam2.IsReverseStrand()) a.strand2 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (_useEditDistance == true) { + if (bam2.GetTag("NM", editDistance2) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + } + + // swap the ends if necessary + if ( a.chrom1 > a.chrom2 || ((a.chrom1 == a.chrom2) && (a.start1 > a.start2)) ) { + swap(a.chrom1, a.chrom2); + swap(a.start1, a.start2); + swap(a.end1, a.end2); + swap(a.strand1, a.strand2); + } + + // compute the minimum mapping quality b/w the two ends of the pair. + a.score = "0"; + if (_useEditDistance == false) { + if (bam1.IsMapped() == true && bam2.IsMapped() == true) + a.score = ToString(min(bam1.MapQuality, bam2.MapQuality)); + } + // BEDPE using edit distance + else { + if (bam1.IsMapped() == true && bam2.IsMapped() == true) + a.score = ToString((int) (editDistance1 + editDistance2)); + else if (bam1.IsMapped() == true) + a.score = ToString((int) editDistance1); + else if (bam2.IsMapped() == true) + a.score = ToString((int) editDistance2); + } + }; + + inline + void ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, + const RefVector &refs, + BamWriter &writer); }; #endif /* PEINTERSECTBED_H */ diff --git a/src/pairToBed/pairToBedMain.cpp b/src/pairToBed/pairToBedMain.cpp index a57177f392a91be3fc02f2358c4dd905118776f1..068e6e3db1755a811f303edd844e2755693ec5c0 100644 --- a/src/pairToBed/pairToBedMain.cpp +++ b/src/pairToBed/pairToBedMain.cpp @@ -25,201 +25,201 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - float overlapFraction = 1E-9; - string searchType = "either"; - - // flags to track parameters - bool haveBedA = false; - bool haveBedB = false; - bool haveSearchType = false; - bool haveFraction = false; - bool forceStrand = false; - bool useEditDistance = false; - bool inputIsBam = false; - bool outputIsBam = true; - bool uncompressedBam = false; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - outputIsBam = false; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - inputIsBam = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { - outputIsBam = false; - } - else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { - useEditDistance = true; - } - else if(PARAMETER_CHECK("-type", 5, parameterLength)) { - if ((i+1) < argc) { - haveSearchType = true; - searchType = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + string searchType = "either"; + + // flags to track parameters + bool haveBedA = false; + bool haveBedB = false; + bool haveSearchType = false; + bool haveFraction = false; + bool forceStrand = false; + bool useEditDistance = false; + bool inputIsBam = false; + bool outputIsBam = true; + bool uncompressedBam = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + outputIsBam = false; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + inputIsBam = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { + outputIsBam = false; + } + else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { + useEditDistance = true; + } + else if(PARAMETER_CHECK("-type", 5, parameterLength)) { + if ((i+1) < argc) { + haveSearchType = true; + searchType = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { uncompressedBam = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (haveSearchType && (searchType != "either") && (searchType != "neither") && (searchType != "both") - && (searchType != "xor") && (searchType != "notboth") && (searchType != "ispan") - && (searchType != "ospan") && (searchType != "notispan") && (searchType != "notospan")) { - cerr << endl << "*****" << endl << "*****ERROR: Request \"either\" or \"both\" or \"neither\" or \"xor\" or \"notboth\" or \"ispan\" or \"ospan\" or \"notispan\" or \"notospan\"" << endl << "*****" << endl; - showHelp = true; - } - - if ( ((searchType == "ispan") || (searchType == "ospan") || (searchType == "notispan") || (searchType == "notospan")) - && forceStrand ) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot enforce strandedness with selected searchtype" << endl << "*****" << endl; - showHelp = true; - } - - if (useEditDistance && (inputIsBam == false || outputIsBam == true)) { - cerr << endl << "*****" << endl << "*****ERROR: -ed must be used with -bedpe and -abam." << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - BedIntersectPE *bi = new BedIntersectPE(bedAFile, bedBFile, overlapFraction, - searchType, forceStrand, inputIsBam, - outputIsBam, uncompressedBam, useEditDistance); - delete bi; - return 0; - } - else { - ShowHelp(); - } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (haveSearchType && (searchType != "either") && (searchType != "neither") && (searchType != "both") + && (searchType != "xor") && (searchType != "notboth") && (searchType != "ispan") + && (searchType != "ospan") && (searchType != "notispan") && (searchType != "notospan")) { + cerr << endl << "*****" << endl << "*****ERROR: Request \"either\" or \"both\" or \"neither\" or \"xor\" or \"notboth\" or \"ispan\" or \"ospan\" or \"notispan\" or \"notospan\"" << endl << "*****" << endl; + showHelp = true; + } + + if ( ((searchType == "ispan") || (searchType == "ospan") || (searchType == "notispan") || (searchType == "notospan")) + && forceStrand ) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot enforce strandedness with selected searchtype" << endl << "*****" << endl; + showHelp = true; + } + + if (useEditDistance && (inputIsBam == false || outputIsBam == true)) { + cerr << endl << "*****" << endl << "*****ERROR: -ed must be used with -bedpe and -abam." << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedIntersectPE *bi = new BedIntersectPE(bedAFile, bedBFile, overlapFraction, + searchType, forceStrand, inputIsBam, + outputIsBam, uncompressedBam, useEditDistance); + delete bi; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Report overlaps between a BEDPE file and a BED/GFF/VCF file." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bedpe> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl; - cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; - - cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; - cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; - - cerr << "\t-bedpe\t" << "When using BAM input (-abam), write output as BEDPE. The default" << endl; - cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; - - cerr << "\t-ed\t" << "Use BAM total edit distance (NM tag) for BEDPE score." << endl; - cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; - cerr << "\t\t of the two mapping qualities for the pair." << endl; - cerr << "\t\t- When -ed is used the total edit distance" << endl; - cerr << "\t\t from the two mates is reported as the score." << endl << endl; - - cerr << "\t-f\t" << "Minimum overlap required as fraction of A (e.g. 0.05)." << endl; - cerr << "\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; - - cerr << "\t-s\t" << "Enforce strandedness when finding overlaps." << endl; - cerr << "\t\tDefault is to ignore stand." << endl; - cerr << "\t\tNot applicable with -type inspan or -type outspan." << endl << endl; - - cerr << "\t-type \t" << "Approach to reporting overlaps between BEDPE and BED." << endl << endl; - cerr << "\t\teither\tReport overlaps if either end of A overlaps B." << endl; - cerr << "\t\t\t- Default." << endl; - - cerr << "\t\tneither\tReport A if neither end of A overlaps B." << endl; - - cerr << "\t\tboth\tReport overlaps if both ends of A overlap B." << endl; - - cerr << "\t\txor\tReport overlaps if one and only one end of A overlaps B." << endl; - - cerr << "\t\tnotboth\tReport overlaps if neither end or one and only one " << endl; - cerr << "\t\t\tend of A overlap B. That is, xor + neither." << endl << endl; - - cerr << "\t\tispan\tReport overlaps between [end1, start2] of A and B." << endl; - cerr << "\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; - - cerr << "\t\tospan\tReport overlaps between [start1, end2] of A and B." << endl; - cerr << "\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; - - cerr << "\t\tnotispan\tReport A if ispan of A doesn't overlap B." << endl; - cerr << "\t\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; - - cerr << "\t\tnotospan\tReport A if ospan of A doesn't overlap B." << endl; - cerr << "\t\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; - - cerr << "Refer to the BEDTools manual for BEDPE format." << endl << endl; - - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Report overlaps between a BEDPE file and a BED/GFF/VCF file." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bedpe> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl; + cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; + + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-bedpe\t" << "When using BAM input (-abam), write output as BEDPE. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-ed\t" << "Use BAM total edit distance (NM tag) for BEDPE score." << endl; + cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; + cerr << "\t\t of the two mapping qualities for the pair." << endl; + cerr << "\t\t- When -ed is used the total edit distance" << endl; + cerr << "\t\t from the two mates is reported as the score." << endl << endl; + + cerr << "\t-f\t" << "Minimum overlap required as fraction of A (e.g. 0.05)." << endl; + cerr << "\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; + + cerr << "\t-s\t" << "Enforce strandedness when finding overlaps." << endl; + cerr << "\t\tDefault is to ignore stand." << endl; + cerr << "\t\tNot applicable with -type inspan or -type outspan." << endl << endl; + + cerr << "\t-type \t" << "Approach to reporting overlaps between BEDPE and BED." << endl << endl; + cerr << "\t\teither\tReport overlaps if either end of A overlaps B." << endl; + cerr << "\t\t\t- Default." << endl; + + cerr << "\t\tneither\tReport A if neither end of A overlaps B." << endl; + + cerr << "\t\tboth\tReport overlaps if both ends of A overlap B." << endl; + + cerr << "\t\txor\tReport overlaps if one and only one end of A overlaps B." << endl; + + cerr << "\t\tnotboth\tReport overlaps if neither end or one and only one " << endl; + cerr << "\t\t\tend of A overlap B. That is, xor + neither." << endl << endl; + + cerr << "\t\tispan\tReport overlaps between [end1, start2] of A and B." << endl; + cerr << "\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; + + cerr << "\t\tospan\tReport overlaps between [start1, end2] of A and B." << endl; + cerr << "\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; + + cerr << "\t\tnotispan\tReport A if ispan of A doesn't overlap B." << endl; + cerr << "\t\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; + + cerr << "\t\tnotospan\tReport A if ospan of A doesn't overlap B." << endl; + cerr << "\t\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; + + cerr << "Refer to the BEDTools manual for BEDPE format." << endl << endl; + + exit(1); } diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index 5f6b4c83657c4329ffaa7fd850857a248f8397f2..4241c44b8d2f8e13c26164c946cadd4b09920a55 100644 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -14,28 +14,29 @@ /* - Constructor + Constructor */ -PairToPair::PairToPair(string &bedAFilePE, string &bedBFilePE, float &overlapFraction, - string searchType, bool ignoreStrand, int slop, bool strandedSlop) { - - _bedAFilePE = bedAFilePE; - _bedBFilePE = bedBFilePE; - _overlapFraction = overlapFraction; - _searchType = searchType; - _ignoreStrand = ignoreStrand; +PairToPair::PairToPair(string &bedAFilePE, string &bedBFilePE, float &overlapFraction, + string searchType, bool ignoreStrand, bool reqDiffNames, int slop, bool strandedSlop) { + + _bedAFilePE = bedAFilePE; + _bedBFilePE = bedBFilePE; + _overlapFraction = overlapFraction; + _searchType = searchType; + _ignoreStrand = ignoreStrand; + _reqDiffNames = reqDiffNames; _slop = slop; _strandedSlop = strandedSlop; - - _bedA = new BedFilePE(bedAFilePE); - _bedB = new BedFilePE(bedBFilePE); - - IntersectPairs(); + + _bedA = new BedFilePE(bedAFilePE); + _bedB = new BedFilePE(bedBFilePE); + + IntersectPairs(); } /* - Destructor + Destructor */ PairToPair::~PairToPair(void) { } @@ -43,68 +44,45 @@ PairToPair::~PairToPair(void) { void PairToPair::IntersectPairs() { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedPEFileIntoMap(); - - int lineNum = 0; - vector<MATE> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; - // reserve some space - hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100); - - BedLineStatus bedStatus; - BEDPE a, nullBedPE; - - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedPEFileIntoMap(); + + int lineNum = 0; + BedLineStatus bedStatus; + BEDPE a, nullBedPE; + + _bedA->Open(); + while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { // identify overlaps b/w the pairs - FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2); - - // reset space for next BEDPE - hitsA1B1.clear(); hitsA1B2.clear(); hitsA2B1.clear(); hitsA2B2.clear(); - a = nullBedPE; - } - } - _bedA->Close(); + FindOverlaps(a); + a = nullBedPE; + } + } + _bedA->Close(); } // END IntersectPE -void PairToPair::FindOverlaps(const BEDPE &a, - vector<MATE> &hitsA1B1, - vector<MATE> &hitsA1B2, - vector<MATE> &hitsA2B1, - vector<MATE> &hitsA2B2) { - - // list of hits on each end of BEDPE - // that exceed the requested overlap fraction - vector<MATE> qualityHitsA1B1; - vector<MATE> qualityHitsA1B2; - vector<MATE> qualityHitsA2B1; - vector<MATE> qualityHitsA2B2; - - // count of hits on each end of BEDPE - // that exceed the requested overlap fraction - int numOverlapsA1B1 = 0; - int numOverlapsA1B2 = 0; - int numOverlapsA2B1 = 0; - int numOverlapsA2B2 = 0; - - // add the appropriate slop to the starts and ends +void PairToPair::FindOverlaps(const BEDPE &a) { + // + vector<MATE> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; + + // add the appropriate slop to the starts and ends CHRPOS start1 = a.start1; CHRPOS end1 = a.end1; CHRPOS start2 = a.start2; CHRPOS end2 = a.end2; - + if (_strandedSlop == true) { - if (a.strand1 == "+") + if (a.strand1 == "+") end1 += _slop; else start1 -= _slop; - if (a.strand2 == "+") + if (a.strand2 == "+") end2 += _slop; else start2 -= _slop; @@ -113,142 +91,109 @@ void PairToPair::FindOverlaps(const BEDPE &a, start1 -= _slop; start2 -= _slop; end1 += _slop; - end2 += _slop; + end2 += _slop; } - // Find the _potential_ hits between each end of A and B - _bedB->FindOverlapsPerBin(1, a.chrom1, start1, end1, a.strand1, hitsA1B1, !(_ignoreStrand)); // hits b/w A1 & B1 - _bedB->FindOverlapsPerBin(1, a.chrom2, start2, end2, a.strand2, hitsA2B1, !(_ignoreStrand)); // hits b/w A2 & B1 - _bedB->FindOverlapsPerBin(2, a.chrom1, start1, end1, a.strand1, hitsA1B2, !(_ignoreStrand)); // hits b/w A1 & B2 - _bedB->FindOverlapsPerBin(2, a.chrom2, start2, end2, a.strand2, hitsA2B2, !(_ignoreStrand)); // hits b/w A2 & B2 - - // Now, reduce to the set of hits on each end of A and B - // that meet the required overlap fraction and orientation. - // FindQualityHitsBetweenEnds(start1, end1, start2, end2, hitsA1B1, qualityHitsA1B1, numOverlapsA1B1); // quality hits b/w A1 & B1 - // FindQualityHitsBetweenEnds(start1, end1, start2, end2, hitsA1B2, qualityHitsA1B2, numOverlapsA1B2); // quality hits b/w A1 & B2 - // FindQualityHitsBetweenEnds(start1, end1, start2, end2, hitsA2B1, qualityHitsA2B1, numOverlapsA2B1); // quality hits b/w A2 & B1 - // FindQualityHitsBetweenEnds(start1, end1, start2, end2, hitsA2B2, qualityHitsA2B2, numOverlapsA2B2); // quality hits b/w A2 & B2 - FindQualityHitsBetweenEnds(start1, end1, hitsA1B1, qualityHitsA1B1, numOverlapsA1B1); // quality hits b/w A1 & B1 - FindQualityHitsBetweenEnds(start1, end1, hitsA1B2, qualityHitsA1B2, numOverlapsA1B2); // quality hits b/w A1 & B2 - FindQualityHitsBetweenEnds(start2, end2, hitsA2B1, qualityHitsA2B1, numOverlapsA2B1); // quality hits b/w A2 & B1 - FindQualityHitsBetweenEnds(start2, end2, hitsA2B2, qualityHitsA2B2, numOverlapsA2B2); // quality hits b/w A2 & B2 - - - int matchCount1 = 0; - int matchCount2 = 0; - if (_searchType == "neither" || _searchType == "both") { - if ((numOverlapsA1B1 > 0) || (numOverlapsA2B2 > 0)) - FindHitsOnBothEnds(a, qualityHitsA1B1, qualityHitsA2B2, matchCount1); - if ((numOverlapsA1B2 > 0) || (numOverlapsA2B1 > 0)) - FindHitsOnBothEnds(a, qualityHitsA2B1, qualityHitsA1B2, matchCount2); - - // report the fact that no hits were found iff _searchType is neither. - if ((matchCount1 == 0) && (matchCount2 == 0) && (_searchType == "neither")) { - _bedA->reportBedPENewLine(a); - } - } - else if (_searchType == "either") { - FindHitsOnEitherEnd(a, qualityHitsA1B1, qualityHitsA2B2, matchCount1); - FindHitsOnEitherEnd(a, qualityHitsA2B1, qualityHitsA1B2, matchCount2); - } + // Find the _potential_ hits between each end of A and B + _bedB->FindOverlapsPerBin(1, a.chrom1, start1, end1, a.name, a.strand1, hitsA1B1, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A1 & B1 + _bedB->FindOverlapsPerBin(1, a.chrom2, start2, end2, a.name, a.strand2, hitsA2B1, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A2 & B1 + _bedB->FindOverlapsPerBin(2, a.chrom1, start1, end1, a.name, a.strand1, hitsA1B2, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A1 & B2 + _bedB->FindOverlapsPerBin(2, a.chrom2, start2, end2, a.name, a.strand2, hitsA2B2, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A2 & B2 + + unsigned int matchCount1 = (hitsA1B1.size() + hitsA2B2.size()); + unsigned int matchCount2 = (hitsA2B1.size() + hitsA1B2.size()); + + + // report the fact that no hits were found iff _searchType is neither. + if ((matchCount1 == 0) && (matchCount2 == 0) && (_searchType == "neither")) { + _bedA->reportBedPENewLine(a); + } + else if (_searchType == "both") { + if ((hitsA1B1.size() > 0) || (hitsA2B2.size() > 0)) + FindHitsOnBothEnds(a, hitsA1B1, hitsA2B2); + if ((hitsA2B1.size() > 0) || (hitsA1B2.size() > 0)) + FindHitsOnBothEnds(a, hitsA2B1, hitsA1B2); + } + else if (_searchType == "either") { + FindHitsOnEitherEnd(a, hitsA1B1, hitsA2B2); + FindHitsOnEitherEnd(a, hitsA2B1, hitsA1B2); + } } -void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vector<MATE> &hits, - vector<MATE> &qualityHits, int &numOverlaps) { +void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2) { - vector<MATE>::const_iterator h = hits.begin(); - vector<MATE>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - int s = max(start, h->bed.start); - int e = min(end, h->bed.end); + map<unsigned int, vector<MATE>, less<int> > hitsMap; - // is there enough overlap (default ~ 1bp) - if ( ((float)(e-s) / (float)(end - start)) >= _overlapFraction ) { - numOverlaps++; - qualityHits.push_back(*h); - } - } -} + for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { + hitsMap[h->lineNum].push_back(*h); + } + for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { + hitsMap[h->lineNum].push_back(*h); + } + + for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { + if (m->second.size() == 2) { + MATE b1 = m->second[0]; + MATE b2 = m->second[1]; -void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, - const vector<MATE> &qualityHitsEnd2, int &matchCount) { - - map<unsigned int, vector<MATE>, less<int> > hitsMap; - - for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { - hitsMap[h->lineNum].push_back(*h); - matchCount++; - } - for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { - hitsMap[h->lineNum].push_back(*h); - matchCount++; - } - - for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { - if (m->second.size() == 2) { - - MATE b1 = m->second[0]; - MATE b2 = m->second[1]; - - if (_searchType == "both") { - _bedA->reportBedPETab(a); - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, - b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, - b1.bed.name.c_str(), b1.bed.score.c_str(), - b1.bed.strand.c_str(), b2.bed.strand.c_str()); - for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) + if (_searchType == "both") { + _bedA->reportBedPETab(a); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, + b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, + b1.bed.name.c_str(), b1.bed.score.c_str(), + b1.bed.strand.c_str(), b2.bed.strand.c_str()); + for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) printf("\t%s", b1.bed.otherFields[i].c_str()); printf("\n"); - } - } - } + } + } + } } -void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, - const vector<MATE> &qualityHitsEnd2, int &matchCount) { - - map<unsigned int, vector<MATE>, less<int> > hitsMap; - - for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { - hitsMap[h->lineNum].push_back(*h); - matchCount++; - } - for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { - hitsMap[h->lineNum].push_back(*h); - matchCount++; - } - - for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { - if (m->second.size() >= 1) { - - if ((m->second.size()) == 2) { - MATE b1 = m->second[0]; - MATE b2 = m->second[1]; - - _bedA->reportBedPETab(a); - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, - b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, - b1.bed.name.c_str(), b1.bed.score.c_str(), +void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2) { + + map<unsigned int, vector<MATE>, less<int> > hitsMap; + + for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { + hitsMap[h->lineNum].push_back(*h); + } + for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { + hitsMap[h->lineNum].push_back(*h); + } + + for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { + if (m->second.size() >= 1) { + + if ((m->second.size()) == 2) { + MATE b1 = m->second[0]; + MATE b2 = m->second[1]; + + _bedA->reportBedPETab(a); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, + b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, + b1.bed.name.c_str(), b1.bed.score.c_str(), b1.bed.strand.c_str(), b2.bed.strand.c_str()); for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) - printf("\t%s", b1.bed.otherFields[i].c_str()); + printf("\t%s", b1.bed.otherFields[i].c_str()); printf("\n"); - } - else { - MATE b1 = m->second[0]; - - _bedA->reportBedPETab(a); - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, - b1.mate->bed.chrom.c_str(), b1.mate->bed.start, b1.mate->bed.end, - b1.bed.name.c_str(), b1.bed.score.c_str(), + } + else { + MATE b1 = m->second[0]; + + _bedA->reportBedPETab(a); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, + b1.mate->bed.chrom.c_str(), b1.mate->bed.start, b1.mate->bed.end, + b1.bed.name.c_str(), b1.bed.score.c_str(), b1.bed.strand.c_str(), b1.mate->bed.strand.c_str()); for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) - printf("\t%s", b1.bed.otherFields[i].c_str()); - printf("\n"); + printf("\t%s", b1.bed.otherFields[i].c_str()); + printf("\n"); } - } - } + } + } } diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h index e30e4a282cacfa81539350913023c88321b8ac52..d69c57242a2eb5d0aa8fb356ea36d0262cf8137a 100644 --- a/src/pairToPair/pairToPair.h +++ b/src/pairToPair/pairToPair.h @@ -29,46 +29,48 @@ class PairToPair { public: - // constructor - PairToPair(string &bedAFilePE, string &bedBFilePE, float &overlapFraction, - string searchType, bool ignoreStrand, int slop, bool strandedSlop); + // constructor + PairToPair(string &bedAFilePE, string &bedBFilePE, float &overlapFraction, + string searchType, bool ignoreStrand, bool reqDiffNames, int slop, bool strandedSlop); - // destructor - ~PairToPair(void); + // destructor + ~PairToPair(void); + + void IntersectPairs(); - void IntersectPairs(); - private: - string _bedAFilePE; - string _bedBFilePE; - - float _overlapFraction; - string _searchType; - bool _ignoreStrand; + string _bedAFilePE; + string _bedBFilePE; + + float _overlapFraction; + string _searchType; + bool _ignoreStrand; + bool _reqDiffNames; int _slop; bool _strandedSlop; - // instance of a paired-end bed file class. - BedFilePE *_bedA; + // instance of a paired-end bed file class. + BedFilePE *_bedA; + + // instance of a bed file class. + BedFilePE *_bedB; - // instance of a bed file class. - BedFilePE *_bedB; - - // methods - void FindOverlaps(const BEDPE &a, vector<MATE> &hitsA1B1, vector<MATE> &hitsA1B2, - vector<MATE> &hitsA2B1, vector<MATE> &hitsA2B2); + // methods + // void FindOverlaps(const BEDPE &a, vector<MATE> &hitsA1B1, vector<MATE> &hitsA1B2, + // vector<MATE> &hitsA2B1, vector<MATE> &hitsA2B2); + void FindOverlaps(const BEDPE &a); void FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vector<MATE> &hits, vector<MATE> &qualityHits, int &numOverlaps); - - void FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, - const vector<MATE> &qualityHitsEnd2, int &matchCount); - - void FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, - const vector<MATE> &qualityHitsEnd2, int &matchCount); - + + void FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2); + + void FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2); + }; #endif /* PAIRTOPAIR_H */ diff --git a/src/pairToPair/pairToPairMain.cpp b/src/pairToPair/pairToPairMain.cpp index 1cf03f22bdc828aa487c6b72c3634990872dbf70..97115f68f7b87f6b19a655e781bc8c8dae720395 100644 --- a/src/pairToPair/pairToPairMain.cpp +++ b/src/pairToPair/pairToPairMain.cpp @@ -25,155 +25,164 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - float overlapFraction = 1E-9; + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; int slop = 0; - string searchType = "both"; - - // flags to track parameters - bool haveBedA = false; - bool haveBedB = false; - bool haveSearchType = false; - bool haveFraction = false; - bool ignoreStrand = false; - bool haveSlop = false; + string searchType = "both"; + + // flags to track parameters + bool haveBedA = false; + bool haveBedB = false; + bool haveSearchType = false; + bool haveFraction = false; + bool ignoreStrand = false; + bool requireDifferentNames = false; + bool haveSlop = false; bool strandedSlop = false; - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-type", 5, parameterLength)) { - if ((i+1) < argc) { - haveSearchType = true; - searchType = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-slop", 5, parameterLength)) { - if ((i+1) < argc) { - haveSlop = true; - slop = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-ss", 3, parameterLength)) { + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-type", 5, parameterLength)) { + if ((i+1) < argc) { + haveSearchType = true; + searchType = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-slop", 5, parameterLength)) { + if ((i+1) < argc) { + haveSlop = true; + slop = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-ss", 3, parameterLength)) { strandedSlop = true; - } - else if(PARAMETER_CHECK("-is", 3, parameterLength)) { - ignoreStrand = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (haveSearchType && (searchType != "neither") && (searchType != "both") && (searchType != "either")) { - cerr << endl << "*****" << endl << "*****ERROR: Request \"both\" or \"neither\"" << endl << "*****" << endl; - showHelp = true; - } - - if (strandedSlop == true && haveSlop == false) { - cerr << endl << "*****" << endl << "*****ERROR: Need a -slop value if requesting -ss." << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - PairToPair *bi = new PairToPair(bedAFile, bedBFile, overlapFraction, searchType, ignoreStrand, slop, strandedSlop); - delete bi; - return 0; - } - else { - ShowHelp(); - } + } + else if(PARAMETER_CHECK("-rdn", 4, parameterLength)) { + requireDifferentNames = true; + } + else if(PARAMETER_CHECK("-is", 3, parameterLength)) { + ignoreStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (haveSearchType && (searchType != "neither") && (searchType != "both") && (searchType != "either")) { + cerr << endl << "*****" << endl << "*****ERROR: Request \"both\" or \"neither\"" << endl << "*****" << endl; + showHelp = true; + } + + if (strandedSlop == true && haveSlop == false) { + cerr << endl << "*****" << endl << "*****ERROR: Need a -slop value if requesting -ss." << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + PairToPair *bi = new PairToPair(bedAFile, bedBFile, overlapFraction, searchType, + ignoreStrand, requireDifferentNames, slop, strandedSlop); + delete bi; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Report overlaps between two paired-end BED files (BEDPE)." << endl << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <BEDPE> -b <BEDPE>" << endl << endl; + cerr << "Summary: Report overlaps between two paired-end BED files (BEDPE)." << endl << endl; - cerr << "Options: " << endl; - cerr << "\t-f\t" << "Minimum overlap required as fraction of A (e.g. 0.05)." << endl; - cerr << "\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <BEDPE> -b <BEDPE>" << endl << endl; - cerr << "\t-type \t" << "Approach to reporting overlaps between A and B." << endl << endl; - cerr << "\t\tneither\tReport overlaps if neither end of A overlaps B." << endl; + cerr << "Options: " << endl; + cerr << "\t-f\t" << "Minimum overlap required as fraction of A (e.g. 0.05)." << endl; + cerr << "\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; - cerr << "\t\teither\tReport overlaps if either ends of A overlap B." << endl; + cerr << "\t-type \t" << "Approach to reporting overlaps between A and B." << endl << endl; + cerr << "\t\tneither\tReport overlaps if neither end of A overlaps B." << endl; - cerr << "\t\tboth\tReport overlaps if both ends of A overlap B." << endl; - cerr << "\t\t\t- Default = both." << endl << endl; + cerr << "\t\teither\tReport overlaps if either ends of A overlap B." << endl; - cerr << "\t-slop \t" << "The amount of slop (in b.p.). to be added to each footprint." << endl; + cerr << "\t\tboth\tReport overlaps if both ends of A overlap B." << endl; + cerr << "\t\t\t- Default = both." << endl << endl; + + cerr << "\t-slop \t" << "The amount of slop (in b.p.). to be added to each footprint." << endl; cerr << "\t\t*Note*: Slop is subtracted from start1 and start2 and added to end1 and end2." << endl << endl; - - cerr << "\t-ss\t" << "Add slop based to each BEDPE footprint based on strand." << endl; - cerr << "\t\t- If strand is \"+\", slop is only added to the end coordinates." << endl; - cerr << "\t\t- If strand is \"-\", slop is only added to the start coordinates." << endl; - cerr << "\t\t- By default, slop is added in both directions." << endl << endl; - - cerr << "\t-is\t" << "Ignore strands when searching for overlaps." << endl; - cerr << "\t\t- By default, strands are enforced." << endl << endl; - - cerr << "Refer to the BEDTools manual for BEDPE format." << endl << endl; - - // end the program here - exit(1); + + cerr << "\t-ss\t" << "Add slop based to each BEDPE footprint based on strand." << endl; + cerr << "\t\t- If strand is \"+\", slop is only added to the end coordinates." << endl; + cerr << "\t\t- If strand is \"-\", slop is only added to the start coordinates." << endl; + cerr << "\t\t- By default, slop is added in both directions." << endl << endl; + + cerr << "\t-is\t" << "Ignore strands when searching for overlaps." << endl; + cerr << "\t\t- By default, strands are enforced." << endl << endl; + + cerr << "\t-rdn\t" << "Require the hits to have different names (i.e. avoid self-hits)." << endl; + cerr << "\t\t- By default, same names are allowed." << endl << endl; + + + cerr << "Refer to the BEDTools manual for BEDPE format." << endl << endl; + + // end the program here + exit(1); } diff --git a/src/shuffleBed/shuffleBed.cpp b/src/shuffleBed/shuffleBed.cpp index 9e0fdded98a6ef81ef9606374b7b0fa001cf2ed1..5306f9f45ee8fc7f41ec014f0822a5d5b292abb1 100644 --- a/src/shuffleBed/shuffleBed.cpp +++ b/src/shuffleBed/shuffleBed.cpp @@ -15,47 +15,47 @@ BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, bool &haveSeed, bool &haveExclude, bool &sameChrom, int &seed) { - _bedFile = bedFile; - _genomeFile = genomeFile; - _excludeFile = excludeFile; - _sameChrom = sameChrom; - _haveExclude = haveExclude; - _haveSeed = haveSeed; - - - // use the supplied seed for the random - // number generation if given. else, - // roll our own. - if (_haveSeed) { - _seed = seed; - srand(seed); - } - else { - srand((unsigned)time(0)); - } - - _bed = new BedFile(bedFile); - _genome = new GenomeFile(genomeFile); - _chroms = _genome->getChromList(); - _numChroms = _genome->getNumberOfChroms(); - - if (_haveExclude) { - _exclude = new BedFile(excludeFile); - _exclude->loadBedFileIntoMap(); - } - - if (_bed->bedFile != "stdin") { // process a file - if (_haveExclude) - ShuffleWithExclusions(); - else - Shuffle(); - } - else { // process stdin - if (_haveExclude) - ShuffleWithExclusions(); - else - Shuffle(); - } + _bedFile = bedFile; + _genomeFile = genomeFile; + _excludeFile = excludeFile; + _sameChrom = sameChrom; + _haveExclude = haveExclude; + _haveSeed = haveSeed; + + + // use the supplied seed for the random + // number generation if given. else, + // roll our own. + if (_haveSeed) { + _seed = seed; + srand(seed); + } + else { + srand((unsigned)time(0)); + } + + _bed = new BedFile(bedFile); + _genome = new GenomeFile(genomeFile); + _chroms = _genome->getChromList(); + _numChroms = _genome->getNumberOfChroms(); + + if (_haveExclude) { + _exclude = new BedFile(excludeFile); + _exclude->loadBedFileIntoMap(); + } + + if (_bed->bedFile != "stdin") { // process a file + if (_haveExclude) + ShuffleWithExclusions(); + else + Shuffle(); + } + else { // process stdin + if (_haveExclude) + ShuffleWithExclusions(); + else + Shuffle(); + } } @@ -66,144 +66,144 @@ BedShuffle::~BedShuffle(void) { void BedShuffle::Shuffle() { - int lineNum = 0; - BED bedEntry, nullBed; // used to store the current BED line from the BED file. - BedLineStatus bedStatus; - - _bed->Open(); - while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - ChooseLocus(bedEntry); - _bed->reportBedNewLine(bedEntry); - bedEntry = nullBed; - } - } - _bed->Close(); + int lineNum = 0; + BED bedEntry, nullBed; // used to store the current BED line from the BED file. + BedLineStatus bedStatus; + + _bed->Open(); + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + ChooseLocus(bedEntry); + _bed->reportBedNewLine(bedEntry); + bedEntry = nullBed; + } + } + _bed->Close(); } void BedShuffle::ShuffleWithExclusions() { - int lineNum = 0; - BED bedEntry, nullBed; // used to store the current BED line from the BED file. - BedLineStatus bedStatus; - vector<BED> hits; - hits.reserve(100); - - _bed->Open(); - while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - // choose a random locus - ChooseLocus(bedEntry); - - // test to see if the chosen locus overlaps - // with an exclude region - _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false); - - bool haveOverlap = false; - vector<BED>::const_iterator hitsItr = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; hitsItr != hitsEnd; ++hitsItr) { - - int s = max(bedEntry.start, hitsItr->start); - int e = min(bedEntry.end, hitsItr->end); - - if ( (e - s) > 0) { - haveOverlap = true; - break; /* stop looking. one overlap is enough*/ - } - } - - /* - keep looking as long as the chosen - locus happens to overlap with regions - that the user wishes to exclude. - */ - int tries = 0; - while ((haveOverlap == true) && (tries <= MAX_TRIES)) { - - // choose a new locus - ChooseLocus(bedEntry); - - vector<BED> hits; - _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, - bedEntry.strand, hits, false); - - haveOverlap = false; - vector<BED>::const_iterator hitsItr = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; hitsItr != hitsEnd; ++hitsItr) { - - int s = max(bedEntry.start, hitsItr->start); - int e = min(bedEntry.end, hitsItr->end); - - if ( (e - s) > 0) { - haveOverlap = true; - break; // stop looking. one overlap is enough - } - } - tries++; - } - - if (tries > MAX_TRIES) { - cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; - } - else { - _bed->reportBedNewLine(bedEntry); - } - } - bedEntry = nullBed; - } - _bed->Close(); + int lineNum = 0; + BED bedEntry, nullBed; // used to store the current BED line from the BED file. + BedLineStatus bedStatus; + vector<BED> hits; + hits.reserve(100); + + _bed->Open(); + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + // choose a random locus + ChooseLocus(bedEntry); + + // test to see if the chosen locus overlaps + // with an exclude region + _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false); + + bool haveOverlap = false; + vector<BED>::const_iterator hitsItr = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; hitsItr != hitsEnd; ++hitsItr) { + + int s = max(bedEntry.start, hitsItr->start); + int e = min(bedEntry.end, hitsItr->end); + + if ( (e - s) > 0) { + haveOverlap = true; + break; /* stop looking. one overlap is enough*/ + } + } + + /* + keep looking as long as the chosen + locus happens to overlap with regions + that the user wishes to exclude. + */ + int tries = 0; + while ((haveOverlap == true) && (tries <= MAX_TRIES)) { + + // choose a new locus + ChooseLocus(bedEntry); + + vector<BED> hits; + _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, + bedEntry.strand, hits, false); + + haveOverlap = false; + vector<BED>::const_iterator hitsItr = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; hitsItr != hitsEnd; ++hitsItr) { + + int s = max(bedEntry.start, hitsItr->start); + int e = min(bedEntry.end, hitsItr->end); + + if ( (e - s) > 0) { + haveOverlap = true; + break; // stop looking. one overlap is enough + } + } + tries++; + } + + if (tries > MAX_TRIES) { + cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; + } + else { + _bed->reportBedNewLine(bedEntry); + } + } + bedEntry = nullBed; + } + _bed->Close(); } void BedShuffle::ChooseLocus(BED &bedEntry) { - - string chrom = bedEntry.chrom; - CHRPOS start = bedEntry.start; - CHRPOS end = bedEntry.end; - CHRPOS length = end - start; - - string randomChrom; - CHRPOS randomStart; - CHRPOS chromSize; - - if (_sameChrom == false) { - randomChrom = _chroms[rand() % _numChroms]; - chromSize = _genome->getChromSize(randomChrom); - randomStart = rand() % chromSize; - bedEntry.chrom = randomChrom; - bedEntry.start = randomStart; - bedEntry.end = randomStart + length; - } - else { - chromSize = _genome->getChromSize(chrom); - randomStart = rand() % chromSize; - bedEntry.start = randomStart; - bedEntry.end = randomStart + length; - } - - // ensure that the chosen location doesn't go past - // the length of the chromosome. if so, keep looking - // for a new spot. - while (bedEntry.end > chromSize) { - if (_sameChrom == false) { - randomChrom = _chroms[rand() % _numChroms]; - chromSize = _genome->getChromSize(randomChrom); - randomStart = rand() % chromSize; - bedEntry.chrom = randomChrom; - bedEntry.start = randomStart; - bedEntry.end = randomStart + length; - } - else { - chromSize = _genome->getChromSize(chrom); - randomStart = rand() % chromSize; - bedEntry.start = randomStart; - bedEntry.end = randomStart + length; - } - } + + string chrom = bedEntry.chrom; + CHRPOS start = bedEntry.start; + CHRPOS end = bedEntry.end; + CHRPOS length = end - start; + + string randomChrom; + CHRPOS randomStart; + CHRPOS chromSize; + + if (_sameChrom == false) { + randomChrom = _chroms[rand() % _numChroms]; + chromSize = _genome->getChromSize(randomChrom); + randomStart = rand() % chromSize; + bedEntry.chrom = randomChrom; + bedEntry.start = randomStart; + bedEntry.end = randomStart + length; + } + else { + chromSize = _genome->getChromSize(chrom); + randomStart = rand() % chromSize; + bedEntry.start = randomStart; + bedEntry.end = randomStart + length; + } + + // ensure that the chosen location doesn't go past + // the length of the chromosome. if so, keep looking + // for a new spot. + while (bedEntry.end > chromSize) { + if (_sameChrom == false) { + randomChrom = _chroms[rand() % _numChroms]; + chromSize = _genome->getChromSize(randomChrom); + randomStart = rand() % chromSize; + bedEntry.chrom = randomChrom; + bedEntry.start = randomStart; + bedEntry.end = randomStart + length; + } + else { + chromSize = _genome->getChromSize(chrom); + randomStart = rand() % chromSize; + bedEntry.start = randomStart; + bedEntry.end = randomStart + length; + } + } } diff --git a/src/shuffleBed/shuffleBed.h b/src/shuffleBed/shuffleBed.h index 9f99af46e7cf853d28aaa7daa8aa18609d0d941f..5dde6108476c4ff9cc0e84f8b034823a05ada353 100644 --- a/src/shuffleBed/shuffleBed.h +++ b/src/shuffleBed/shuffleBed.h @@ -30,36 +30,36 @@ class BedShuffle { public: - // constructor - BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, - bool &haveSeed, bool &haveExclude, bool &sameChrom, int &seed); + // constructor + BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, + bool &haveSeed, bool &haveExclude, bool &sameChrom, int &seed); - // destructor - ~BedShuffle(void); + // destructor + ~BedShuffle(void); private: - string _bedFile; - string _genomeFile; - string _excludeFile; - int _seed; - bool _sameChrom; - bool _haveExclude; - bool _haveSeed; - - - // The BED file from which to compute coverage. - BedFile *_bed; - BedFile *_exclude; - - GenomeFile *_genome; - - vector<string> _chroms; - int _numChroms; - - // methods - void Shuffle(); - void ShuffleWithExclusions(); - - void ChooseLocus(BED &); + string _bedFile; + string _genomeFile; + string _excludeFile; + int _seed; + bool _sameChrom; + bool _haveExclude; + bool _haveSeed; + + + // The BED file from which to compute coverage. + BedFile *_bed; + BedFile *_exclude; + + GenomeFile *_genome; + + vector<string> _chroms; + int _numChroms; + + // methods + void Shuffle(); + void ShuffleWithExclusions(); + + void ChooseLocus(BED &); }; diff --git a/src/shuffleBed/shuffleBedMain.cpp b/src/shuffleBed/shuffleBedMain.cpp index a9890329a9d74a59321f7f346c31256ae67fc79e..eabb79346d1cdaf3fa0be9e4215f54f2bb077c8a 100644 --- a/src/shuffleBed/shuffleBedMain.cpp +++ b/src/shuffleBed/shuffleBedMain.cpp @@ -26,129 +26,129 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - string excludeFile; - string genomeFile; - - bool haveBed = true; - bool haveGenome = false; - bool haveExclude = false; - bool haveSeed = false; - int seed = -1; - bool sameChrom = false; - - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) < argc) { - haveGenome = true; - genomeFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-excl", 5, parameterLength)) { - if ((i+1) < argc) { - haveExclude = true; - excludeFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-seed", 5, parameterLength)) { - if ((i+1) < argc) { - haveSeed = true; - seed = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-chrom", 6, parameterLength)) { - sameChrom = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed || !haveGenome) { - cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, haveSeed, haveExclude, sameChrom, seed); - delete bc; - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string excludeFile; + string genomeFile; + + bool haveBed = true; + bool haveGenome = false; + bool haveExclude = false; + bool haveSeed = false; + int seed = -1; + bool sameChrom = false; + + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-excl", 5, parameterLength)) { + if ((i+1) < argc) { + haveExclude = true; + excludeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-seed", 5, parameterLength)) { + if ((i+1) < argc) { + haveSeed = true; + seed = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-chrom", 6, parameterLength)) { + sameChrom = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, haveSeed, haveExclude, sameChrom, seed); + delete bc; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Randomly permute the locations of a feature file among a genome." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t-excl\t" << "A BED/GFF/VCF file of coordinates in which features in -i" << endl; - cerr << "\t\tshould not be placed (e.g. gaps.bed)." << endl << endl; - - cerr << "\t-chrom\t" << "Keep features in -i on the same chromosome."<< endl; - cerr << "\t\t- By default, the chrom and position are randomly chosen." << endl << endl; - - cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl; - cerr << "\t\t- By default, the seed is chosen automatically." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - - cerr << "Notes: " << endl; - cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; - cerr << "\t <chromName><TAB><chromSize>" << endl << endl; - cerr << "\tFor example, Human (hg19):" << endl; - cerr << "\tchr1\t249250621" << endl; - cerr << "\tchr2\t243199373" << endl; - cerr << "\t..." << endl; - cerr << "\tchr18_gl000207_random\t4262" << endl << endl; - - - cerr << "Tips: " << endl; - cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; - cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; - cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; - cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; - - - // end the program here - exit(1); + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Randomly permute the locations of a feature file among a genome." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-excl\t" << "A BED/GFF/VCF file of coordinates in which features in -i" << endl; + cerr << "\t\tshould not be placed (e.g. gaps.bed)." << endl << endl; + + cerr << "\t-chrom\t" << "Keep features in -i on the same chromosome."<< endl; + cerr << "\t\t- By default, the chrom and position are randomly chosen." << endl << endl; + + cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl; + cerr << "\t\t- By default, the seed is chosen automatically." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + + cerr << "Notes: " << endl; + cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\t <chromName><TAB><chromSize>" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + + cerr << "Tips: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + + // end the program here + exit(1); } diff --git a/src/slopBed/slopBed.cpp b/src/slopBed/slopBed.cpp index 5d646cd135486fb82a11dd4310f3ad9712b81a24..0b4166ae952e67c8d01aa06d9ef3c7afceb1677f 100644 --- a/src/slopBed/slopBed.cpp +++ b/src/slopBed/slopBed.cpp @@ -15,17 +15,17 @@ BedSlop::BedSlop(string &bedFile, string &genomeFile, bool &forceStrand, int &leftSlop, int &rightSlop) { - _bedFile = bedFile; - _genomeFile = genomeFile; - _forceStrand = forceStrand; - - _leftSlop = leftSlop; - _rightSlop = rightSlop; - - _bed = new BedFile(bedFile); - _genome = new GenomeFile(genomeFile); - - SlopBed(); + _bedFile = bedFile; + _genomeFile = genomeFile; + _forceStrand = forceStrand; + + _leftSlop = leftSlop; + _rightSlop = rightSlop; + + _bed = new BedFile(bedFile); + _genome = new GenomeFile(genomeFile); + + SlopBed(); } @@ -35,49 +35,49 @@ BedSlop::~BedSlop(void) { void BedSlop::SlopBed() { - - int lineNum = 0; - BED bedEntry, nullBed; // used to store the current BED line from the BED file. - BedLineStatus bedStatus; - - _bed->Open(); - bedStatus = _bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { - if (bedStatus == BED_VALID) { - AddSlop(bedEntry); - _bed->reportBedNewLine(bedEntry); - bedEntry = nullBed; - } - bedStatus = _bed->GetNextBed(bedEntry, lineNum); - } - _bed->Close(); + + int lineNum = 0; + BED bedEntry, nullBed; // used to store the current BED line from the BED file. + BedLineStatus bedStatus; + + _bed->Open(); + bedStatus = _bed->GetNextBed(bedEntry, lineNum); + while (bedStatus != BED_INVALID) { + if (bedStatus == BED_VALID) { + AddSlop(bedEntry); + _bed->reportBedNewLine(bedEntry); + bedEntry = nullBed; + } + bedStatus = _bed->GetNextBed(bedEntry, lineNum); + } + _bed->Close(); } void BedSlop::AddSlop(BED &bed) { - // special handling if the BED entry is on the negative - // strand and the user cares about strandedness. - CHRPOS chromSize = _genome->getChromSize(bed.chrom); - - if ( (_forceStrand) && (bed.strand == "-") ) { - // inspect the start - if ( (static_cast<int>(bed.start) - _rightSlop) > 0 ) bed.start -= _rightSlop; - else bed.start = 0; - - // inspect the start - if ( (static_cast<int>(bed.end) + _leftSlop) <= static_cast<int>(chromSize)) bed.end += _leftSlop; - else bed.end = chromSize; - } - else { - // inspect the start - if ( (static_cast<int>(bed.start) - _leftSlop) > 0) bed.start -= _leftSlop; - else bed.start = 0; - - // inspect the end - if ( (static_cast<int>(bed.end) + _rightSlop) <= static_cast<int>(chromSize)) bed.end += _rightSlop; - else bed.end = chromSize; - } + // special handling if the BED entry is on the negative + // strand and the user cares about strandedness. + CHRPOS chromSize = _genome->getChromSize(bed.chrom); + + if ( (_forceStrand) && (bed.strand == "-") ) { + // inspect the start + if ( (static_cast<int>(bed.start) - _rightSlop) > 0 ) bed.start -= _rightSlop; + else bed.start = 0; + + // inspect the start + if ( (static_cast<int>(bed.end) + _leftSlop) <= static_cast<int>(chromSize)) bed.end += _leftSlop; + else bed.end = chromSize; + } + else { + // inspect the start + if ( (static_cast<int>(bed.start) - _leftSlop) > 0) bed.start -= _leftSlop; + else bed.start = 0; + + // inspect the end + if ( (static_cast<int>(bed.end) + _rightSlop) <= static_cast<int>(chromSize)) bed.end += _rightSlop; + else bed.end = chromSize; + } } diff --git a/src/slopBed/slopBed.h b/src/slopBed/slopBed.h index 2e73f3969fd3b4bbb75f9b17174b4898f9674b32..48d36f4f87af8ccaa04a90f118588877a7f470eb 100644 --- a/src/slopBed/slopBed.h +++ b/src/slopBed/slopBed.h @@ -29,30 +29,30 @@ class BedSlop { public: - // constructor - BedSlop(string &bedFile, string &genomeFile, bool &forceStrand, int &leftSlop, int &rightSlop) ; + // constructor + BedSlop(string &bedFile, string &genomeFile, bool &forceStrand, int &leftSlop, int &rightSlop) ; + + // destructor + ~BedSlop(void); - // destructor - ~BedSlop(void); - private: - string _bedFile; - string _genomeFile; - - bool _forceStrand; - int _leftSlop; - int _rightSlop; - - BedFile *_bed; - GenomeFile *_genome; - - // methods - - void SlopBed(); - - // method to add requested "slop" to a single BED entry - void AddSlop(BED &bed); + string _bedFile; + string _genomeFile; + + bool _forceStrand; + int _leftSlop; + int _rightSlop; + + BedFile *_bed; + GenomeFile *_genome; + + // methods + + void SlopBed(); + + // method to add requested "slop" to a single BED entry + void AddSlop(BED &bed); }; diff --git a/src/slopBed/slopBedMain.cpp b/src/slopBed/slopBedMain.cpp index 8319b47bf694b6213a61f2b578a111d3e4eb0384..3e490de07c0aebddc06a3ef6dba2dcc0995c9c97 100644 --- a/src/slopBed/slopBedMain.cpp +++ b/src/slopBed/slopBedMain.cpp @@ -26,157 +26,157 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - string genomeFile; - - bool haveBed = true; - bool haveGenome = false; - bool haveLeft = false; - bool haveRight = false; - bool haveBoth = false; - - bool forceStrand = false; - int leftSlop = 0; - int rightSlop = 0; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) < argc) { - haveGenome = true; - genomeFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-l", 2, parameterLength)) { - if ((i+1) < argc) { - haveLeft = true; - leftSlop = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-r", 2, parameterLength)) { - if ((i+1) < argc) { - haveRight = true; - rightSlop = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBoth = true; - leftSlop = atoi(argv[i + 1]); - rightSlop = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed || !haveGenome) { - cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; - showHelp = true; - } - if (!haveLeft && !haveRight && !haveBoth) { - cerr << endl << "*****" << endl << "*****ERROR: Need -l and -r together or -b alone. " << endl << "*****" << endl; - showHelp = true; - } - if ((!haveLeft && haveRight) || (haveLeft && !haveRight)) { - cerr << endl << "*****" << endl << "*****ERROR: Need both -l and -r. " << endl << "*****" << endl; - showHelp = true; - } - if (forceStrand && (!(haveLeft) || !(haveRight))) { - cerr << endl << "*****" << endl << "*****ERROR: Must supply -l and -r with -s. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedSlop *bc = new BedSlop(bedFile, genomeFile, forceStrand, leftSlop, rightSlop); - delete bc; - - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + bool haveBed = true; + bool haveGenome = false; + bool haveLeft = false; + bool haveRight = false; + bool haveBoth = false; + + bool forceStrand = false; + int leftSlop = 0; + int rightSlop = 0; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-l", 2, parameterLength)) { + if ((i+1) < argc) { + haveLeft = true; + leftSlop = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-r", 2, parameterLength)) { + if ((i+1) < argc) { + haveRight = true; + rightSlop = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBoth = true; + leftSlop = atoi(argv[i + 1]); + rightSlop = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; + showHelp = true; + } + if (!haveLeft && !haveRight && !haveBoth) { + cerr << endl << "*****" << endl << "*****ERROR: Need -l and -r together or -b alone. " << endl << "*****" << endl; + showHelp = true; + } + if ((!haveLeft && haveRight) || (haveLeft && !haveRight)) { + cerr << endl << "*****" << endl << "*****ERROR: Need both -l and -r. " << endl << "*****" << endl; + showHelp = true; + } + if (forceStrand && (!(haveLeft) || !(haveRight))) { + cerr << endl << "*****" << endl << "*****ERROR: Must supply -l and -r with -s. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedSlop *bc = new BedSlop(bedFile, genomeFile, forceStrand, leftSlop, rightSlop); + delete bc; + + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Add requested base pairs of \"slop\" to each feature." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome> [-b <int> or (-l and -r)]" << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t-b\t" << "Increase the BED/GFF/VCF entry by -b base pairs in each direction." << endl; - cerr << "\t\t- (Integer)" << endl; - - cerr << "\t-l\t" << "The number of base pairs to subtract from the start coordinate." << endl; - cerr << "\t\t- (Integer)" << endl; - - cerr << "\t-r\t" << "The number of base pairs to add to the end coordinate." << endl; - cerr << "\t\t- (Integer)" << endl; - - cerr << "\t-s\t" << "Define -l and -r based on strand." << endl; - cerr << "\t\tE.g. if used, -l 500 for a negative-stranded feature, " << endl; - cerr << "\t\tit will add 500 bp downstream. Default = false." << endl << endl; - - cerr << "Notes: " << endl; - cerr << "\t(1) Starts will be set to 0 if options would force it below 0." << endl; - cerr << "\t(2) Ends will be set to the chromosome length if requested slop would" << endl; - cerr << "\tforce it above the max chrom length." << endl; - - cerr << "\t(3) The genome file should tab delimited and structured as follows:" << endl; - cerr << "\n\t<chromName><TAB><chromSize>" << endl << endl; - cerr << "\tFor example, Human (hg19):" << endl; - cerr << "\tchr1\t249250621" << endl; - cerr << "\tchr2\t243199373" << endl; - cerr << "\t..." << endl; - cerr << "\tchr18_gl000207_random\t4262" << endl << endl; - - - cerr << "Tips: " << endl; - cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; - cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; - cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; - cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; - - - // end the program here - exit(1); + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Add requested base pairs of \"slop\" to each feature." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome> [-b <int> or (-l and -r)]" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-b\t" << "Increase the BED/GFF/VCF entry by -b base pairs in each direction." << endl; + cerr << "\t\t- (Integer)" << endl; + + cerr << "\t-l\t" << "The number of base pairs to subtract from the start coordinate." << endl; + cerr << "\t\t- (Integer)" << endl; + + cerr << "\t-r\t" << "The number of base pairs to add to the end coordinate." << endl; + cerr << "\t\t- (Integer)" << endl; + + cerr << "\t-s\t" << "Define -l and -r based on strand." << endl; + cerr << "\t\tE.g. if used, -l 500 for a negative-stranded feature, " << endl; + cerr << "\t\tit will add 500 bp downstream. Default = false." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) Starts will be set to 0 if options would force it below 0." << endl; + cerr << "\t(2) Ends will be set to the chromosome length if requested slop would" << endl; + cerr << "\tforce it above the max chrom length." << endl; + + cerr << "\t(3) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\n\t<chromName><TAB><chromSize>" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + + cerr << "Tips: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e /" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + + // end the program here + exit(1); } diff --git a/src/sortBed/sortBed.cpp b/src/sortBed/sortBed.cpp index 89641be0d19a7fb4efd55b3079a63d991727e4b5..f76704642c7c0571c16596b66ae6a5e38154026e 100644 --- a/src/sortBed/sortBed.cpp +++ b/src/sortBed/sortBed.cpp @@ -16,8 +16,8 @@ // Constructor // BedSort::BedSort(string &bedFile) { - _bedFile = bedFile; - _bed = new BedFile(bedFile); + _bedFile = bedFile; + _bed = new BedFile(bedFile); } // @@ -29,173 +29,173 @@ BedSort::~BedSort(void) { void BedSort::SortBed() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - // bedList is already sorted by start position. - vector<BED> bedList = m->second; + // bedList is already sorted by start position. + vector<BED> bedList = m->second; - for (unsigned int i = 0; i < bedList.size(); ++i) { - _bed->reportBedNewLine(bedList[i]); - } - } + for (unsigned int i = 0; i < bedList.size(); ++i) { + _bed->reportBedNewLine(bedList[i]); + } + } } void BedSort::SortBedBySizeAsc() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - vector<BED> masterList; - masterList.reserve(1000000); - - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - - // add the entries from this chromosome to the current list - for (unsigned int i = 0; i < m->second.size(); ++i) { - masterList.push_back(m->second[i]); - } - } - - // sort the master list by size (asc.) - sort(masterList.begin(), masterList.end(), sortBySizeAsc); - - // report the entries in ascending order - for (unsigned int i = 0; i < masterList.size(); ++i) { - _bed->reportBedNewLine(masterList[i]); - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + vector<BED> masterList; + masterList.reserve(1000000); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // add the entries from this chromosome to the current list + for (unsigned int i = 0; i < m->second.size(); ++i) { + masterList.push_back(m->second[i]); + } + } + + // sort the master list by size (asc.) + sort(masterList.begin(), masterList.end(), sortBySizeAsc); + + // report the entries in ascending order + for (unsigned int i = 0; i < masterList.size(); ++i) { + _bed->reportBedNewLine(masterList[i]); + } } void BedSort::SortBedBySizeDesc() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - vector<BED> masterList; - masterList.reserve(1000000); - - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - - // add the entries from this chromosome to the current list - for (unsigned int i = 0; i < m->second.size(); ++i) { - masterList.push_back(m->second[i]); - } - } - - // sort the master list by size (asc.) - sort(masterList.begin(), masterList.end(), sortBySizeDesc); - - // report the entries in ascending order - for (unsigned int i = 0; i < masterList.size(); ++i) { - _bed->reportBedNewLine(masterList[i]); - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + vector<BED> masterList; + masterList.reserve(1000000); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // add the entries from this chromosome to the current list + for (unsigned int i = 0; i < m->second.size(); ++i) { + masterList.push_back(m->second[i]); + } + } + + // sort the master list by size (asc.) + sort(masterList.begin(), masterList.end(), sortBySizeDesc); + + // report the entries in ascending order + for (unsigned int i = 0; i < masterList.size(); ++i) { + _bed->reportBedNewLine(masterList[i]); + } } void BedSort::SortBedByChromThenSizeAsc() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - sort(bedList.begin(), bedList.end(), sortBySizeAsc); - - for (unsigned int i = 0; i < bedList.size(); ++i) { - _bed->reportBedNewLine(bedList[i]); - } - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortBySizeAsc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + _bed->reportBedNewLine(bedList[i]); + } + } } void BedSort::SortBedByChromThenSizeDesc() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + // bedList is already sorted by start position. + vector<BED> bedList = m->second; - // bedList is already sorted by start position. - vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortBySizeDesc); - sort(bedList.begin(), bedList.end(), sortBySizeDesc); - - for (unsigned int i = 0; i < bedList.size(); ++i) { - _bed->reportBedNewLine(bedList[i]); - } - } + for (unsigned int i = 0; i < bedList.size(); ++i) { + _bed->reportBedNewLine(bedList[i]); + } + } } void BedSort::SortBedByChromThenScoreAsc() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - if (_bed->bedType >= 5) { - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - sort(bedList.begin(), bedList.end(), sortByScoreAsc); - - for (unsigned int i = 0; i < bedList.size(); ++i) { - _bed->reportBedNewLine(bedList[i]); - } - } - } - else { - cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl; - exit(1); - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + if (_bed->bedType >= 5) { + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortByScoreAsc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + _bed->reportBedNewLine(bedList[i]); + } + } + } + else { + cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl; + exit(1); + } } void BedSort::SortBedByChromThenScoreDesc() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - if (_bed->bedType >= 5) { - // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - - // bedList is already sorted by start position. - vector<BED> bedList = m->second; - sort(bedList.begin(), bedList.end(), sortByScoreDesc); - - for (unsigned int i = 0; i < bedList.size(); ++i) { - _bed->reportBedNewLine(bedList[i]); - } - } - } - else { - cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl; - exit(1); - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bed->loadBedFileIntoMapNoBin(); + + if (_bed->bedType >= 5) { + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortByScoreDesc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + _bed->reportBedNewLine(bedList[i]); + } + } + } + else { + cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl; + exit(1); + } } diff --git a/src/sortBed/sortBed.h b/src/sortBed/sortBed.h index 350c9ae376db2804c4354c0a9005639589ae46b1..11e7e90a5e0d32c582f7e48d9a946a05e7619522 100644 --- a/src/sortBed/sortBed.h +++ b/src/sortBed/sortBed.h @@ -25,26 +25,26 @@ class BedSort { public: - // constructor - BedSort(string &); - - // destructor - ~BedSort(void); - - void SortBed(); // the default. sorts by chrom (asc.) then by start (asc.) - void SortBedBySizeAsc(); - void SortBedBySizeDesc(); - void SortBedByChromThenSizeAsc(); - void SortBedByChromThenSizeDesc(); - void SortBedByChromThenScoreAsc(); - void SortBedByChromThenScoreDesc(); - -private: - string _bedFile; - - // instance of a bed file class. - BedFile *_bed; - - // methods + // constructor + BedSort(string &); + + // destructor + ~BedSort(void); + + void SortBed(); // the default. sorts by chrom (asc.) then by start (asc.) + void SortBedBySizeAsc(); + void SortBedBySizeDesc(); + void SortBedByChromThenSizeAsc(); + void SortBedByChromThenSizeDesc(); + void SortBedByChromThenScoreAsc(); + void SortBedByChromThenScoreDesc(); + +private: + string _bedFile; + + // instance of a bed file class. + BedFile *_bed; + + // methods }; diff --git a/src/sortBed/sortMain.cpp b/src/sortBed/sortMain.cpp index 304ec53040301d0508c5eb2a642120d37b3e1858..c5c19ab89d19dc1c5d106538961e2b062d0cc563 100644 --- a/src/sortBed/sortMain.cpp +++ b/src/sortBed/sortMain.cpp @@ -26,132 +26,132 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - bool haveBed = true; - int sortChoices = 0; - - bool sortBySizeAsc = false; - bool sortBySizeDesc = false; - bool sortByChromThenSizeAsc = false; - bool sortByChromThenSizeDesc = false; - bool sortByChromThenScoreAsc = false; - bool sortByChromThenScoreDesc = false; - - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-sizeA", 6, parameterLength)) { - sortBySizeAsc = true; - sortChoices++; - } - else if(PARAMETER_CHECK("-sizeD", 6, parameterLength)) { - sortBySizeDesc = true; - sortChoices++; - } - else if(PARAMETER_CHECK("-chrThenSizeA", 13, parameterLength)) { - sortByChromThenSizeAsc = true; - sortChoices++; - } - else if(PARAMETER_CHECK("-chrThenSizeD", 13, parameterLength)) { - sortByChromThenSizeDesc = true; - sortChoices++; - } - else if(PARAMETER_CHECK("-chrThenScoreA", 14, parameterLength)) { - sortByChromThenScoreAsc = true; - sortChoices++; - } - else if(PARAMETER_CHECK("-chrThenScoreD", 14, parameterLength)) { - sortByChromThenScoreDesc = true; - sortChoices++; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; - showHelp = true; - } - if (sortChoices > 1) { - cerr << endl << "*****" << endl << "*****ERROR: Sorting options are mutually exclusive. Please choose just one. " << endl << "*****" << endl; - showHelp = true; - } - - - if (!showHelp) { - BedSort *bm = new BedSort(bedFile); - - if (sortBySizeAsc) { - bm->SortBedBySizeAsc(); - } - else if (sortBySizeDesc) { - bm->SortBedBySizeDesc(); - } - else if (sortByChromThenSizeAsc) { - bm->SortBedByChromThenSizeAsc(); - } - else if (sortByChromThenSizeDesc) { - bm->SortBedByChromThenSizeDesc(); - } - else if (sortByChromThenScoreAsc) { - bm->SortBedByChromThenScoreAsc(); - } - else if (sortByChromThenScoreDesc) { - bm->SortBedByChromThenScoreDesc(); - } - else { - bm->SortBed(); - } - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + bool haveBed = true; + int sortChoices = 0; + + bool sortBySizeAsc = false; + bool sortBySizeDesc = false; + bool sortByChromThenSizeAsc = false; + bool sortByChromThenSizeDesc = false; + bool sortByChromThenScoreAsc = false; + bool sortByChromThenScoreDesc = false; + + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-sizeA", 6, parameterLength)) { + sortBySizeAsc = true; + sortChoices++; + } + else if(PARAMETER_CHECK("-sizeD", 6, parameterLength)) { + sortBySizeDesc = true; + sortChoices++; + } + else if(PARAMETER_CHECK("-chrThenSizeA", 13, parameterLength)) { + sortByChromThenSizeAsc = true; + sortChoices++; + } + else if(PARAMETER_CHECK("-chrThenSizeD", 13, parameterLength)) { + sortByChromThenSizeDesc = true; + sortChoices++; + } + else if(PARAMETER_CHECK("-chrThenScoreA", 14, parameterLength)) { + sortByChromThenScoreAsc = true; + sortChoices++; + } + else if(PARAMETER_CHECK("-chrThenScoreD", 14, parameterLength)) { + sortByChromThenScoreDesc = true; + sortChoices++; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; + showHelp = true; + } + if (sortChoices > 1) { + cerr << endl << "*****" << endl << "*****ERROR: Sorting options are mutually exclusive. Please choose just one. " << endl << "*****" << endl; + showHelp = true; + } + + + if (!showHelp) { + BedSort *bm = new BedSort(bedFile); + + if (sortBySizeAsc) { + bm->SortBedBySizeAsc(); + } + else if (sortBySizeDesc) { + bm->SortBedBySizeDesc(); + } + else if (sortByChromThenSizeAsc) { + bm->SortBedByChromThenSizeAsc(); + } + else if (sortByChromThenSizeDesc) { + bm->SortBedByChromThenSizeDesc(); + } + else if (sortByChromThenScoreAsc) { + bm->SortBedByChromThenScoreAsc(); + } + else if (sortByChromThenScoreDesc) { + bm->SortBedByChromThenScoreDesc(); + } + else { + bm->SortBed(); + } + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Summary: Sorts a feature file in various and useful ways." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t" << "-sizeA\t\t" << "Sort by feature size in ascending order." << endl; - cerr << "\t" << "-sizeD\t\t" << "Sort by feature size in descending order." << endl; - cerr << "\t" << "-chrThenSizeA\t" << "Sort by chrom (asc), then feature size (asc)." << endl; - cerr << "\t" << "-chrThenSizeD\t" << "Sort by chrom (asc), then feature size (desc)." << endl; - cerr << "\t" << "-chrThenScoreA\t" << "Sort by chrom (asc), then score (asc)." << endl; - cerr << "\t" << "-chrThenScoreD\t" << "Sort by chrom (asc), then score (desc)." << endl << endl; - - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << "Summary: Sorts a feature file in various and useful ways." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t" << "-sizeA\t\t" << "Sort by feature size in ascending order." << endl; + cerr << "\t" << "-sizeD\t\t" << "Sort by feature size in descending order." << endl; + cerr << "\t" << "-chrThenSizeA\t" << "Sort by chrom (asc), then feature size (asc)." << endl; + cerr << "\t" << "-chrThenSizeD\t" << "Sort by chrom (asc), then feature size (desc)." << endl; + cerr << "\t" << "-chrThenScoreA\t" << "Sort by chrom (asc), then score (asc)." << endl; + cerr << "\t" << "-chrThenScoreD\t" << "Sort by chrom (asc), then score (desc)." << endl << endl; + + exit(1); } diff --git a/src/subtractBed/subtractBed.cpp b/src/subtractBed/subtractBed.cpp index 5d71c9d53282e7d7c56b1ac81e6fcedd6e8bddbd..c5a8f0439b8b53f1318aed6c12d36a3df5ffd7de 100644 --- a/src/subtractBed/subtractBed.cpp +++ b/src/subtractBed/subtractBed.cpp @@ -14,163 +14,163 @@ /* - Constructor + Constructor */ BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand) { - _bedAFile = bedAFile; - _bedBFile = bedBFile; - _overlapFraction = overlapFraction; - _forceStrand = forceStrand; + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - - SubtractBed(); + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + SubtractBed(); } /* - Destructor + Destructor */ BedSubtract::~BedSubtract(void) { } void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { - - // find all of the overlaps between a and B. - _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); - - // is A completely spanned by an entry in B? - // if so, A should not be reported. - int numConsumedByB = 0; - int numOverlaps = 0; - vector<BED> bOverlaps; // list of hits in B. Special processing if there are multiple. - - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - - int s = max(a.start, h->start); - int e = min(a.end, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end - a.start); // the length of a in b.p. - - if (s < e) { - - // is there enough overlap (default ~ 1bp) - float overlap = ((float) overlapBases / (float) aLength); - - if (overlap >= 1.0) { - numOverlaps++; - numConsumedByB++; - } - else if ( overlap >= _overlapFraction ) { - numOverlaps++; - bOverlaps.push_back(*h); - } - } - } - - if (numOverlaps == 0) { - // no overlap found, so just report A as-is. - _bedA->reportBedNewLine(a); - } - else if (numOverlaps == 1) { - // one overlap found. only need to look at the single - // entry in bOverlaps. - - // if A was not "consumed" by any entry in B - if (numConsumedByB == 0) { - - BED theHit = bOverlaps[0]; - - // A ++++++++++++ - // B ---- - // Res. ==== ==== - if ( (theHit.start > a.start) && (theHit.end < a.end) ) { - _bedA->reportBedRangeNewLine(a,a.start,theHit.start); - _bedA->reportBedRangeNewLine(a,theHit.end,a.end); - } - // A ++++++++++++ - // B ---------- - // Res. == - else if (theHit.start == a.start) { - _bedA->reportBedRangeNewLine(a,theHit.end,a.end); - } - // A ++++++++++++ - // B ---------- - // Res. ==== - else if (theHit.start < a.start) { - _bedA->reportBedRangeNewLine(a,theHit.end,a.end); - } - // A ++++++++++++ - // B ---------- - // Res. ======= - else if (theHit.start > a.start) { - _bedA->reportBedRangeNewLine(a,a.start,theHit.start); - } - } - } - else if (numOverlaps > 1) { - // multiple overlapz found. look at all the hits - // and figure out which bases in A survived. then - // report the contigous intervals that survived. - - vector<bool> aKeep(a.end - a.start, true); - - if (numConsumedByB == 0) { - // track the number of hit starts and ends at each position in A - for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) { - int s = max(a.start, h->start); - int e = min(a.end, h->end); - - for (int i = s+1; i <= e; ++i) { - aKeep[i-a.start-1] = false; - } - } - // report the remaining blocks. - for (unsigned int i = 0; i < aKeep.size(); ++i) { - if (aKeep[i] == true) { - CHRPOS blockStart = i + a.start; - while ((aKeep[i] == true) && (i < aKeep.size())) { - i++; - } - CHRPOS blockEnd = i + a.start; - blockEnd = min(a.end, blockEnd); - _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); - } - } - } - } + + // find all of the overlaps between a and B. + _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); + + // is A completely spanned by an entry in B? + // if so, A should not be reported. + int numConsumedByB = 0; + int numOverlaps = 0; + vector<BED> bOverlaps; // list of hits in B. Special processing if there are multiple. + + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + + int s = max(a.start, h->start); + int e = min(a.end, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int aLength = (a.end - a.start); // the length of a in b.p. + + if (s < e) { + + // is there enough overlap (default ~ 1bp) + float overlap = ((float) overlapBases / (float) aLength); + + if (overlap >= 1.0) { + numOverlaps++; + numConsumedByB++; + } + else if ( overlap >= _overlapFraction ) { + numOverlaps++; + bOverlaps.push_back(*h); + } + } + } + + if (numOverlaps == 0) { + // no overlap found, so just report A as-is. + _bedA->reportBedNewLine(a); + } + else if (numOverlaps == 1) { + // one overlap found. only need to look at the single + // entry in bOverlaps. + + // if A was not "consumed" by any entry in B + if (numConsumedByB == 0) { + + BED theHit = bOverlaps[0]; + + // A ++++++++++++ + // B ---- + // Res. ==== ==== + if ( (theHit.start > a.start) && (theHit.end < a.end) ) { + _bedA->reportBedRangeNewLine(a,a.start,theHit.start); + _bedA->reportBedRangeNewLine(a,theHit.end,a.end); + } + // A ++++++++++++ + // B ---------- + // Res. == + else if (theHit.start == a.start) { + _bedA->reportBedRangeNewLine(a,theHit.end,a.end); + } + // A ++++++++++++ + // B ---------- + // Res. ==== + else if (theHit.start < a.start) { + _bedA->reportBedRangeNewLine(a,theHit.end,a.end); + } + // A ++++++++++++ + // B ---------- + // Res. ======= + else if (theHit.start > a.start) { + _bedA->reportBedRangeNewLine(a,a.start,theHit.start); + } + } + } + else if (numOverlaps > 1) { + // multiple overlapz found. look at all the hits + // and figure out which bases in A survived. then + // report the contigous intervals that survived. + + vector<bool> aKeep(a.end - a.start, true); + + if (numConsumedByB == 0) { + // track the number of hit starts and ends at each position in A + for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) { + int s = max(a.start, h->start); + int e = min(a.end, h->end); + + for (int i = s+1; i <= e; ++i) { + aKeep[i-a.start-1] = false; + } + } + // report the remaining blocks. + for (unsigned int i = 0; i < aKeep.size(); ++i) { + if (aKeep[i] == true) { + CHRPOS blockStart = i + a.start; + while ((aKeep[i] == true) && (i < aKeep.size())) { + i++; + } + CHRPOS blockEnd = i + a.start; + blockEnd = min(a.end, blockEnd); + _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); + } + } + } + } } - + void BedSubtract::SubtractBed() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - BED a, nullBed; - BedLineStatus bedStatus; - int lineNum = 0; // current input line number - vector<BED> hits; // vector of potential hits - // reserve some space - hits.reserve(100); - - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - FindAndSubtractOverlaps(a, hits); - hits.clear(); - a = nullBed; - } - } - _bedA->Close(); - + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + BED a, nullBed; + BedLineStatus bedStatus; + int lineNum = 0; // current input line number + vector<BED> hits; // vector of potential hits + // reserve some space + hits.reserve(100); + + _bedA->Open(); + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + FindAndSubtractOverlaps(a, hits); + hits.clear(); + a = nullBed; + } + } + _bedA->Close(); + } // END Intersect diff --git a/src/subtractBed/subtractBed.h b/src/subtractBed/subtractBed.h index 23bba114b6d960f9134e0a9bcc4b994b2fdfe083..b613c21377e13e895ffb369e3406b2975608b4b2 100644 --- a/src/subtractBed/subtractBed.h +++ b/src/subtractBed/subtractBed.h @@ -26,27 +26,27 @@ class BedSubtract { public: - // constructor - BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand); + // constructor + BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand); - // destructor - ~BedSubtract(void); + // destructor + ~BedSubtract(void); private: - // processing variables - string _bedAFile; - string _bedBFile; - float _overlapFraction; - bool _noHit; - bool _forceStrand; - - // instances of bed file class. - BedFile *_bedA, *_bedB; - - // methods - void FindAndSubtractOverlaps(BED &a, vector<BED> &hits); - void SubtractBed(); + // processing variables + string _bedAFile; + string _bedBFile; + float _overlapFraction; + bool _noHit; + bool _forceStrand; + + // instances of bed file class. + BedFile *_bedA, *_bedB; + + // methods + void FindAndSubtractOverlaps(BED &a, vector<BED> &hits); + void SubtractBed(); }; #endif /* SUBTRACTBED_H */ diff --git a/src/subtractBed/subtractMain.cpp b/src/subtractBed/subtractMain.cpp index 8986b386aef2830adf442e52d6c487ed7520b37f..9225d3ff249cfabba2d99df81483d5ab4d36373a 100644 --- a/src/subtractBed/subtractMain.cpp +++ b/src/subtractBed/subtractMain.cpp @@ -26,108 +26,108 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - float overlapFraction = 1E-9; - - bool haveBedA = false; - bool haveBedB = false; - bool haveFraction = false; - bool forceStrand = false; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction, forceStrand); - delete bs; - return 0; - } - else { - ShowHelp(); - } + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + + bool haveBedA = false; + bool haveBedB = false; + bool haveFraction = false; + bool forceStrand = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction, forceStrand); + delete bs; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "Summary: Removes the portion(s) of an interval that is overlapped" << endl; - cerr << "\t by another feature(s)." << endl << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + cerr << "Summary: Removes the portion(s) of an interval that is overlapped" << endl; + cerr << "\t by another feature(s)." << endl << endl; - cerr << "Options: " << endl; - cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; - cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; - cerr << "\t\t- (FLOAT) (e.g. 0.50)" << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; - cerr << "\t\toverlap A on the same strand." << endl; - cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + cerr << "Options: " << endl; + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- (FLOAT) (e.g. 0.50)" << endl << endl; + cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; + cerr << "\t\toverlap A on the same strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; - // end the program here - exit(1); + + // end the program here + exit(1); } diff --git a/src/unionBedGraphs/intervalItem.h b/src/unionBedGraphs/intervalItem.h index 00591b1e208c0ea08619f11dc6e7aaa150e7b737..ffec8c68719931039f7674d8abdb6ee8a38c86b7 100644 --- a/src/unionBedGraphs/intervalItem.h +++ b/src/unionBedGraphs/intervalItem.h @@ -16,8 +16,8 @@ #include <queue> enum COORDINATE_TYPE { - START, - END + START, + END }; /* @@ -28,32 +28,32 @@ enum COORDINATE_TYPE { class IntervalItem { private: - IntervalItem(); + IntervalItem(); public: - int source_index; // which source BedGraph file this came from - COORDINATE_TYPE coord_type; // is this the start or the end position? - CHRPOS coord; - std::string depth; - - IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord, std::string _depth) : - source_index(_index), - coord_type(_type), - coord(_coord), - depth(_depth) - {} - - IntervalItem(const IntervalItem &other) : - source_index(other.source_index), - coord_type(other.coord_type), - coord(other.coord), - depth(other.depth) - {} - - bool operator< ( const IntervalItem& other ) const - { - return this->coord > other.coord; - } + int source_index; // which source BedGraph file this came from + COORDINATE_TYPE coord_type; // is this the start or the end position? + CHRPOS coord; + std::string depth; + + IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord, std::string _depth) : + source_index(_index), + coord_type(_type), + coord(_coord), + depth(_depth) + {} + + IntervalItem(const IntervalItem &other) : + source_index(other.source_index), + coord_type(other.coord_type), + coord(other.coord), + depth(other.depth) + {} + + bool operator< ( const IntervalItem& other ) const + { + return this->coord > other.coord; + } }; // our priority queue diff --git a/src/unionBedGraphs/unionBedGraphs.cpp b/src/unionBedGraphs/unionBedGraphs.cpp index 7eac9df02f11d7915fe85a128ffb9b372d4dcd16..998ca6beb5ea9e1c0501bd1831a93412bb2f26cc 100644 --- a/src/unionBedGraphs/unionBedGraphs.cpp +++ b/src/unionBedGraphs/unionBedGraphs.cpp @@ -23,233 +23,233 @@ using namespace std; UnionBedGraphs::UnionBedGraphs(std::ostream& _output, - const vector<string>& _filenames, - const vector<string>& _titles, - bool _print_empty_regions, - const std::string& _genome_size_filename, - const std::string& _no_coverage_value ) : - filenames(_filenames), - titles(_titles), - output(_output), - current_non_zero_inputs(0), - print_empty_regions(_print_empty_regions), - genome_sizes(NULL), - no_coverage_value(_no_coverage_value) + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genome_size_filename, + const std::string& _no_coverage_value ) : + filenames(_filenames), + titles(_titles), + output(_output), + current_non_zero_inputs(0), + print_empty_regions(_print_empty_regions), + genome_sizes(NULL), + no_coverage_value(_no_coverage_value) { - if (print_empty_regions) { - assert(!_genome_size_filename.empty()); + if (print_empty_regions) { + assert(!_genome_size_filename.empty()); - genome_sizes = new GenomeFile(_genome_size_filename); - } + genome_sizes = new GenomeFile(_genome_size_filename); + } } UnionBedGraphs::~UnionBedGraphs() { - CloseBedgraphFiles(); - if (genome_sizes) { - delete genome_sizes; - genome_sizes = NULL ; - } + CloseBedgraphFiles(); + if (genome_sizes) { + delete genome_sizes; + genome_sizes = NULL ; + } } void UnionBedGraphs::Union() { - OpenBedgraphFiles(); - - // Add the first interval from each file - for(size_t i=0;i<bedgraph_files.size();++i) - LoadNextBedgraphItem(i); - - // Chromosome loop - once per chromosome - do { - // Find the first chromosome to use - current_chrom = DetermineNextChrom(); - - // Populate the queue with initial values from all files - // (if they belong to the correct chromosome) - for(size_t i=0;i<bedgraph_files.size();++i) - AddInterval(i); - - CHRPOS current_start = ConsumeNextCoordinate(); - - // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage - if (print_empty_regions && current_start > 0) - PrintEmptyCoverage(0,current_start); - - // Intervals loop - until all intervals (of current chromosome) from all files are used. - do { - CHRPOS current_end = queue.top().coord; - PrintCoverage(current_start, current_end); - current_start = ConsumeNextCoordinate(); - } while (!queue.empty()); - - // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome - // print a dummy empty coverage - if (print_empty_regions) { - CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); - if (current_start < chrom_size) - PrintEmptyCoverage(current_start, chrom_size); - } - - } while (!AllFilesDone()); + OpenBedgraphFiles(); + + // Add the first interval from each file + for(size_t i=0;i<bedgraph_files.size();++i) + LoadNextBedgraphItem(i); + + // Chromosome loop - once per chromosome + do { + // Find the first chromosome to use + current_chrom = DetermineNextChrom(); + + // Populate the queue with initial values from all files + // (if they belong to the correct chromosome) + for(size_t i=0;i<bedgraph_files.size();++i) + AddInterval(i); + + CHRPOS current_start = ConsumeNextCoordinate(); + + // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage + if (print_empty_regions && current_start > 0) + PrintEmptyCoverage(0,current_start); + + // Intervals loop - until all intervals (of current chromosome) from all files are used. + do { + CHRPOS current_end = queue.top().coord; + PrintCoverage(current_start, current_end); + current_start = ConsumeNextCoordinate(); + } while (!queue.empty()); + + // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome + // print a dummy empty coverage + if (print_empty_regions) { + CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); + if (current_start < chrom_size) + PrintEmptyCoverage(current_start, chrom_size); + } + + } while (!AllFilesDone()); } CHRPOS UnionBedGraphs::ConsumeNextCoordinate() { - assert(!queue.empty()); + assert(!queue.empty()); - CHRPOS new_position = queue.top().coord; - do { - IntervalItem item = queue.top(); - UpdateInformation(item); - queue.pop(); - } while (!queue.empty() && queue.top().coord == new_position); + CHRPOS new_position = queue.top().coord; + do { + IntervalItem item = queue.top(); + UpdateInformation(item); + queue.pop(); + } while (!queue.empty() && queue.top().coord == new_position); - return new_position; + return new_position; } void UnionBedGraphs::UpdateInformation(const IntervalItem &item) { - // Update the depth coverage for this file - - // Which coordinate is it - start or end? - switch (item.coord_type) - { - case START: - current_depth[item.source_index] = item.depth; - current_non_zero_inputs++; - break; - case END: - //Read the next interval from this file - AddInterval(item.source_index); - current_depth[item.source_index] = no_coverage_value; - current_non_zero_inputs--; - break; - default: - assert(0); - } + // Update the depth coverage for this file + + // Which coordinate is it - start or end? + switch (item.coord_type) + { + case START: + current_depth[item.source_index] = item.depth; + current_non_zero_inputs++; + break; + case END: + //Read the next interval from this file + AddInterval(item.source_index); + current_depth[item.source_index] = no_coverage_value; + current_non_zero_inputs--; + break; + default: + assert(0); + } } void UnionBedGraphs::PrintHeader() { - output << "chrom\tstart\tend" ; - for (size_t i=0;i<titles.size();++i) - output << "\t" <<titles[i]; - output << endl; + output << "chrom\tstart\tend" ; + for (size_t i=0;i<titles.size();++i) + output << "\t" <<titles[i]; + output << endl; } void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) { - if ( current_non_zero_inputs == 0 && ! print_empty_regions ) - return ; + if ( current_non_zero_inputs == 0 && ! print_empty_regions ) + return ; - output << current_chrom << "\t" - << start << "\t" - << end; + output << current_chrom << "\t" + << start << "\t" + << end; - for (size_t i=0;i<current_depth.size();++i) - output << "\t" << current_depth[i] ; + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << current_depth[i] ; - output << endl; + output << endl; } void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { - output << current_chrom << "\t" - << start << "\t" - << end; + output << current_chrom << "\t" + << start << "\t" + << end; - for (size_t i=0;i<current_depth.size();++i) - output << "\t" << no_coverage_value ; + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << no_coverage_value ; - output << endl; + output << endl; } void UnionBedGraphs::LoadNextBedgraphItem(int index) { - assert(static_cast<unsigned int>(index) < bedgraph_files.size()); + assert(static_cast<unsigned int>(index) < bedgraph_files.size()); - current_bedgraph_item[index].chrom=""; + current_bedgraph_item[index].chrom=""; - BedGraphFile *file = bedgraph_files[index]; - BEDGRAPH_STR bg; - int lineNum = 0; - BedGraphLineStatus status; + BedGraphFile *file = bedgraph_files[index]; + BEDGRAPH_STR bg; + int lineNum = 0; + BedGraphLineStatus status; - while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID ) { - if (status != BEDGRAPH_VALID) - continue; + while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID ) { + if (status != BEDGRAPH_VALID) + continue; - current_bedgraph_item[index] = bg ; - break; - } + current_bedgraph_item[index] = bg ; + break; + } } bool UnionBedGraphs::AllFilesDone() { - for (size_t i=0;i<current_bedgraph_item.size();++i) - if (!current_bedgraph_item[i].chrom.empty()) - return false; - return true; + for (size_t i=0;i<current_bedgraph_item.size();++i) + if (!current_bedgraph_item[i].chrom.empty()) + return false; + return true; } string UnionBedGraphs::DetermineNextChrom() { - string next_chrom; - for (size_t i=0;i<current_bedgraph_item.size();++i) { - if (current_bedgraph_item[i].chrom.empty()) - continue; - - if (next_chrom.empty()) - next_chrom = current_bedgraph_item[i].chrom; - else - if (current_bedgraph_item[i].chrom < next_chrom) - next_chrom = current_bedgraph_item[i].chrom ; - } - return next_chrom; + string next_chrom; + for (size_t i=0;i<current_bedgraph_item.size();++i) { + if (current_bedgraph_item[i].chrom.empty()) + continue; + + if (next_chrom.empty()) + next_chrom = current_bedgraph_item[i].chrom; + else + if (current_bedgraph_item[i].chrom < next_chrom) + next_chrom = current_bedgraph_item[i].chrom ; + } + return next_chrom; } void UnionBedGraphs::AddInterval(int index) { - assert(static_cast<unsigned int>(index) < bedgraph_files.size()); + assert(static_cast<unsigned int>(index) < bedgraph_files.size()); - //This file has no more intervals - if (current_bedgraph_item[index].chrom.empty()) - return ; + //This file has no more intervals + if (current_bedgraph_item[index].chrom.empty()) + return ; - //If the next interval belongs to a different chrom, don't add it - if (current_bedgraph_item[index].chrom!=current_chrom) - return ; + //If the next interval belongs to a different chrom, don't add it + if (current_bedgraph_item[index].chrom!=current_chrom) + return ; - const BEDGRAPH_STR &bg(current_bedgraph_item[index]); + const BEDGRAPH_STR &bg(current_bedgraph_item[index]); - IntervalItem start_item(index, START, bg.start, bg.depth); - IntervalItem end_item(index, END, bg.end, bg.depth); + IntervalItem start_item(index, START, bg.start, bg.depth); + IntervalItem end_item(index, END, bg.end, bg.depth); - queue.push(start_item); - queue.push(end_item); + queue.push(start_item); + queue.push(end_item); - LoadNextBedgraphItem(index); + LoadNextBedgraphItem(index); } void UnionBedGraphs::OpenBedgraphFiles() { - for (size_t i=0;i<filenames.size();++i) { - BedGraphFile *file = new BedGraphFile(filenames[i]); - file->Open(); - bedgraph_files.push_back(file); - - current_depth.push_back(no_coverage_value); - } - current_bedgraph_item.resize(filenames.size()); + for (size_t i=0;i<filenames.size();++i) { + BedGraphFile *file = new BedGraphFile(filenames[i]); + file->Open(); + bedgraph_files.push_back(file); + + current_depth.push_back(no_coverage_value); + } + current_bedgraph_item.resize(filenames.size()); } void UnionBedGraphs::CloseBedgraphFiles() { - for (size_t i=0;i<bedgraph_files.size();++i) { - BedGraphFile *file = bedgraph_files[i]; - delete file; - bedgraph_files[i] = NULL ; - } - bedgraph_files.clear(); + for (size_t i=0;i<bedgraph_files.size();++i) { + BedGraphFile *file = bedgraph_files[i]; + delete file; + bedgraph_files[i] = NULL ; + } + bedgraph_files.clear(); } diff --git a/src/unionBedGraphs/unionBedGraphs.h b/src/unionBedGraphs/unionBedGraphs.h index 852812a064dc3011baa2c056d245b7a60392d840..705e32f5f5c8f1d69456df5083bd84bc7a3b9f22 100644 --- a/src/unionBedGraphs/unionBedGraphs.h +++ b/src/unionBedGraphs/unionBedGraphs.h @@ -22,101 +22,101 @@ class UnionBedGraphs { private: - typedef BEDGRAPH_STR BEDGRAPH_TYPE; + typedef BEDGRAPH_STR BEDGRAPH_TYPE; - vector<string> filenames; - vector<string> titles; + vector<string> filenames; + vector<string> titles; - vector<BedGraphFile*> bedgraph_files; - vector<BEDGRAPH_TYPE::DEPTH_TYPE> current_depth; - vector<BEDGRAPH_TYPE> current_bedgraph_item; + vector<BedGraphFile*> bedgraph_files; + vector<BEDGRAPH_TYPE::DEPTH_TYPE> current_depth; + vector<BEDGRAPH_TYPE> current_bedgraph_item; - std::ostream &output; + std::ostream &output; - INTERVALS_PRIORITY_QUEUE queue; - std::string current_chrom; - int current_non_zero_inputs; - bool print_empty_regions; + INTERVALS_PRIORITY_QUEUE queue; + std::string current_chrom; + int current_non_zero_inputs; + bool print_empty_regions; - GenomeFile* genome_sizes; + GenomeFile* genome_sizes; - std::string no_coverage_value; + std::string no_coverage_value; public: - UnionBedGraphs(std::ostream& _output, - const vector<string>& _filenames, - const vector<string>& _titles, - bool _print_empty_regions, - const std::string& _genomeFileName, - const std::string& _no_coverage_value); + UnionBedGraphs(std::ostream& _output, + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genomeFileName, + const std::string& _no_coverage_value); - virtual ~UnionBedGraphs(); + virtual ~UnionBedGraphs(); - // Combines all bedgraph files - void Union(); + // Combines all bedgraph files + void Union(); - // Print the header line: chrom/start/end + name of each bedgraph file. - void PrintHeader(); + // Print the header line: chrom/start/end + name of each bedgraph file. + void PrintHeader(); private: - - // Open all BedGraph files, initialize "current_XXX" vectors - void OpenBedgraphFiles(); + + // Open all BedGraph files, initialize "current_XXX" vectors + void OpenBedgraphFiles(); // Close the BedGraph files. - void CloseBedgraphFiles(); - - /* - Add an interval from BedGraph file 'index' into the queue. - will only be added if it belongs to the current chromosome. - - If the interval was added (=consumed), the next interval will be read from the file - using 'LoadNextBedgraphItem' - */ - void AddInterval(int index); - - /* - Loads the next interval from BedGraph file 'index'. - Stores it in 'current_bedgraph_item' vector. - */ - void LoadNextBedgraphItem(int index); - - /* - Scans the 'current_bedgraph_item' vector, - find the 'first' chromosome to use (different BedGraph files can start with different chromosomes). - */ - std::string DetermineNextChrom(); - - /* - Returns 'true' if ALL intervals from ALL BedGraph files were used - */ - bool AllFilesDone(); - - /* - Extract the next coordinate from the queue, and updates the current coverage information. - If multiple interval share the same coordinate values, all of them are handled. - If an END coordinate is consumed, the next interval (from the corresponding file) is read. - */ - CHRPOS ConsumeNextCoordinate(); - - /* - Updates the coverage information based on the given item. - Item can be a START coordinate or an END coordiante. - */ - void UpdateInformation(const IntervalItem &item); - - /* - prints chrom/start/end and the current depth coverage values of all the files. - */ - void PrintCoverage(CHRPOS start, CHRPOS end); - - /* - prints chrom/start/end and the ZERO depth coverage values of all the files. - */ - void PrintEmptyCoverage(CHRPOS start, CHRPOS end); - - void DebugPrintQueue(); + void CloseBedgraphFiles(); + + /* + Add an interval from BedGraph file 'index' into the queue. + will only be added if it belongs to the current chromosome. + + If the interval was added (=consumed), the next interval will be read from the file + using 'LoadNextBedgraphItem' + */ + void AddInterval(int index); + + /* + Loads the next interval from BedGraph file 'index'. + Stores it in 'current_bedgraph_item' vector. + */ + void LoadNextBedgraphItem(int index); + + /* + Scans the 'current_bedgraph_item' vector, + find the 'first' chromosome to use (different BedGraph files can start with different chromosomes). + */ + std::string DetermineNextChrom(); + + /* + Returns 'true' if ALL intervals from ALL BedGraph files were used + */ + bool AllFilesDone(); + + /* + Extract the next coordinate from the queue, and updates the current coverage information. + If multiple interval share the same coordinate values, all of them are handled. + If an END coordinate is consumed, the next interval (from the corresponding file) is read. + */ + CHRPOS ConsumeNextCoordinate(); + + /* + Updates the coverage information based on the given item. + Item can be a START coordinate or an END coordiante. + */ + void UpdateInformation(const IntervalItem &item); + + /* + prints chrom/start/end and the current depth coverage values of all the files. + */ + void PrintCoverage(CHRPOS start, CHRPOS end); + + /* + prints chrom/start/end and the ZERO depth coverage values of all the files. + */ + void PrintEmptyCoverage(CHRPOS start, CHRPOS end); + + void DebugPrintQueue(); }; diff --git a/src/unionBedGraphs/unionBedGraphsMain.cpp b/src/unionBedGraphs/unionBedGraphsMain.cpp index c36d4e314850252c03f957da98bdaa122fbaca4b..b140b013d84b74ce2d5a2c27337431072b07d21c 100644 --- a/src/unionBedGraphs/unionBedGraphsMain.cpp +++ b/src/unionBedGraphs/unionBedGraphsMain.cpp @@ -43,127 +43,127 @@ void ShowExamples(void); int main(int argc, char* argv[]) { - bool haveFiles = false; - bool haveTitles = false; + bool haveFiles = false; + bool haveTitles = false; bool haveGenome = false; bool haveFiller = true; - bool printHeader = false; - bool printEmptyRegions = false; + bool printHeader = false; + bool printEmptyRegions = false; bool showHelp = false; - string genomeFile; - string basePath; - string noCoverageValue = "0"; - vector<string> inputFiles; - vector<string> inputTitles; - - //Parse command line options - if(argc <= 1) - ShowHelp(); - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp == true) { - ShowHelp(); + string genomeFile; + string basePath; + string noCoverageValue = "0"; + vector<string> inputFiles; + vector<string> inputTitles; + + //Parse command line options + if(argc <= 1) + ShowHelp(); + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp == true) { + ShowHelp(); exit(1); } - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); + int parameterLength = (int)strlen(argv[i]); - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - haveFiles = true; + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + haveFiles = true; i = i+1; string file = argv[i]; - while (file[0] != '-' && i < argc) { + while (file[0] != '-' && i < argc) { inputFiles.push_back(file); i++; if (i < argc) file = argv[i]; - } + } i--; - } - } - else if(PARAMETER_CHECK("-names", 6, parameterLength)) { - if ((i+1) < argc) { - haveTitles = true; + } + } + else if(PARAMETER_CHECK("-names", 6, parameterLength)) { + if ((i+1) < argc) { + haveTitles = true; i = i+1; string title = argv[i]; - while (title[0] != '-' && i < argc) { + while (title[0] != '-' && i < argc) { inputTitles.push_back(title); i++; if (i < argc) title = argv[i]; - } + } i--; - } - } - else if(PARAMETER_CHECK("-g", 2, parameterLength)) { - if ((i+1) < argc) { - haveGenome = true; - genomeFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-filler", 7, parameterLength)) { - if ((i+1) < argc) { - haveFiller = true; - noCoverageValue = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-header", 7, parameterLength)) { - printHeader = true; - } - else if(PARAMETER_CHECK("-empty", 6, parameterLength)) { - printEmptyRegions = true; - } - else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-filler", 7, parameterLength)) { + if ((i+1) < argc) { + haveFiller = true; + noCoverageValue = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } + else if(PARAMETER_CHECK("-empty", 6, parameterLength)) { + printEmptyRegions = true; + } + else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { ShowHelp(); ShowExamples(); exit(1); - } - } - - //Sanity checks - if (inputFiles.empty() == true) { - cerr << "Error: missing BedGraph file names (-i) to combine." << endl; - exit(1); - } - if (inputFiles.size() == 1) { - cerr << "Error: Only a single BedGraph file was specified. Nothing to combine, exiting." << endl; - exit(1); - } - if (printEmptyRegions && (genomeFile.empty() == true)) { - cerr << "Error: when using -empty, the genome sizes file (-g) must be specified using '-g FILE'." << endl; - exit(1); - } - if ((haveTitles == true) && (inputFiles.size() != inputTitles.size())) { - cerr << "Error: The number of file titles (-names) does not match the number of files (-i)." << endl; - exit(1); - } - - UnionBedGraphs ubg(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); - if (printHeader) - ubg.PrintHeader(); - ubg.Union(); + } + } + + //Sanity checks + if (inputFiles.empty() == true) { + cerr << "Error: missing BedGraph file names (-i) to combine." << endl; + exit(1); + } + if (inputFiles.size() == 1) { + cerr << "Error: Only a single BedGraph file was specified. Nothing to combine, exiting." << endl; + exit(1); + } + if (printEmptyRegions && (genomeFile.empty() == true)) { + cerr << "Error: when using -empty, the genome sizes file (-g) must be specified using '-g FILE'." << endl; + exit(1); + } + if ((haveTitles == true) && (inputFiles.size() != inputTitles.size())) { + cerr << "Error: The number of file titles (-names) does not match the number of files (-i)." << endl; + exit(1); + } + + UnionBedGraphs ubg(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); + if (printHeader) + ubg.PrintHeader(); + ubg.Union(); } void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Authors: Assaf Gordon, CSHL" << endl; - cerr << " Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl; + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Authors: Assaf Gordon, CSHL" << endl; + cerr << " Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl; cerr << "Summary: Combines multiple BedGraph files into a single file," << endl; cerr << "\t allowing coverage comparisons between them." << endl << endl; @@ -172,123 +172,123 @@ void ShowHelp(void) { cerr << "\t Assumes that each BedGraph file is sorted by chrom/start " << endl; cerr << "\t and that the intervals in each are non-overlapping." << endl << endl; - cerr << "Options: " << endl; - - cerr << "\t-header\t\t" << "Print a header line." << endl; - cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; - - cerr << "\t-names\t\t" << "A list of names (one / file) to describe each file in -i." << endl; - cerr << "\t\t\tThese names will be printed in the header line." << endl << endl; - - cerr << "\t-g\t\t" << "Use genome file to calculate empty regions." << endl; - cerr << "\t\t\t- STRING." << endl << endl; - - cerr << "\t-empty\t\t" << "Report empty regions (i.e., start/end intervals w/o" << endl; - cerr << "\t\t\tvalues in all files)." << endl; - cerr << "\t\t\t- Requires the '-g FILE' parameter.\n" << endl; - - cerr << "\t-filler TEXT\t" << "Use TEXT when representing intervals having no value." << endl; - cerr << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl; - - cerr << "\t-examples\t" << "Show detailed usage examples." << endl << endl; + cerr << "Options: " << endl; + + cerr << "\t-header\t\t" << "Print a header line." << endl; + cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; + + cerr << "\t-names\t\t" << "A list of names (one / file) to describe each file in -i." << endl; + cerr << "\t\t\tThese names will be printed in the header line." << endl << endl; + + cerr << "\t-g\t\t" << "Use genome file to calculate empty regions." << endl; + cerr << "\t\t\t- STRING." << endl << endl; + + cerr << "\t-empty\t\t" << "Report empty regions (i.e., start/end intervals w/o" << endl; + cerr << "\t\t\tvalues in all files)." << endl; + cerr << "\t\t\t- Requires the '-g FILE' parameter.\n" << endl; + + cerr << "\t-filler TEXT\t" << "Use TEXT when representing intervals having no value." << endl; + cerr << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl; + + cerr << "\t-examples\t" << "Show detailed usage examples." << endl << endl; } void ShowExamples() { - cerr << "Example usage:\n\n" \ + cerr << "Example usage:\n\n" \ "== Input files: ==\n" \ "\n" \ " $ cat 1.bg\n" \ -" chr1 1000 1500 10\n" \ -" chr1 2000 2100 20\n" \ +" chr1 1000 1500 10\n" \ +" chr1 2000 2100 20\n" \ "\n" \ " $ cat 2.bg\n" \ -" chr1 900 1600 60\n" \ -" chr1 1700 2050 50\n" \ +" chr1 900 1600 60\n" \ +" chr1 1700 2050 50\n" \ "\n" \ " $ cat 3.bg\n" \ -" chr1 1980 2070 80\n" \ -" chr1 2090 2100 20\n" \ +" chr1 1980 2070 80\n" \ +" chr1 2090 2100 20\n" \ "\n" \ " $ cat sizes.txt\n" \ -" chr1 5000\n" \ +" chr1 5000\n" \ "\n" \ "== Union/combine the files: ==\n" \ "\n" \ " $ unionBedGraphs -i 1.bg 2.bg 3.bg\n" \ -" chr1 900 1000 0 60 0\n" \ -" chr1 1000 1500 10 60 0\n" \ -" chr1 1500 1600 0 60 0\n" \ -" chr1 1700 1980 0 50 0\n" \ -" chr1 1980 2000 0 50 80\n" \ -" chr1 2000 2050 20 50 80\n" \ -" chr1 2050 2070 20 0 80\n" \ -" chr1 2070 2090 20 0 0\n" \ -" chr1 2090 2100 20 0 20\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ "\n" \ "== Union/combine the files, with a header line (titles are the file names): ==\n" \ "\n" \ " $ unionBedGraphs -header -i 1.bg 2.bg 3.bg\n" \ -" chrom start end 1 2 3\n" \ -" chr1 900 1000 0 60 0\n" \ -" chr1 1000 1500 10 60 0\n" \ -" chr1 1500 1600 0 60 0\n" \ -" chr1 1700 1980 0 50 0\n" \ -" chr1 1980 2000 0 50 80\n" \ -" chr1 2000 2050 20 50 80\n" \ -" chr1 2050 2070 20 0 80\n" \ -" chr1 2070 2090 20 0 0\n" \ -" chr1 2090 2100 20 0 20\n" \ +" chrom start end 1 2 3\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ "\n" \ "== Union/combine the files, with a header line and custom names: ==\n" \ "\n" \ " $ unionBedGraphs -header -i 1.bg 2.bg 3.bg -names WT-1 WT-2 KO-1\n" \ -" chrom start end WT-1 WT-2 KO-1\n" \ -" chr1 900 1000 0 60 0\n" \ -" chr1 1000 1500 10 60 0\n" \ -" chr1 1500 1600 0 60 0\n" \ -" chr1 1700 1980 0 50 0\n" \ -" chr1 1980 2000 0 50 80\n" \ -" chr1 2000 2050 20 50 80\n" \ -" chr1 2050 2070 20 0 80\n" \ -" chr1 2070 2090 20 0 0\n" \ -" chr1 2090 2100 20 0 20\n" \ +" chrom start end WT-1 WT-2 KO-1\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ "\n" \ "== Union/combine, showing empty regions (note, requires -g): ==\n" \ "\n" \ " $ unionBedGraphs -header -empty -g sizes.TXT -i 1.bg 2.bg 3.bg\n" \ -" chrom start end 1 2 3\n" \ -" chr1 0 900 0 0 0\n" \ -" chr1 900 1000 0 60 0\n" \ -" chr1 1000 1500 10 60 0\n" \ -" chr1 1500 1600 0 60 0\n" \ -" chr1 1600 1700 0 0 0\n" \ -" chr1 1700 1980 0 50 0\n" \ -" chr1 1980 2000 0 50 80\n" \ -" chr1 2000 2050 20 50 80\n" \ -" chr1 2050 2070 20 0 80\n" \ -" chr1 2070 2090 20 0 0\n" \ -" chr1 2090 2100 20 0 20\n" \ -" chr1 2100 5000 0 0 0\n" \ +" chrom start end 1 2 3\n" \ +" chr1 0 900 0 0 0\n" \ +" chr1 900 1000 0 60 0\n" \ +" chr1 1000 1500 10 60 0\n" \ +" chr1 1500 1600 0 60 0\n" \ +" chr1 1600 1700 0 0 0\n" \ +" chr1 1700 1980 0 50 0\n" \ +" chr1 1980 2000 0 50 80\n" \ +" chr1 2000 2050 20 50 80\n" \ +" chr1 2050 2070 20 0 80\n" \ +" chr1 2070 2090 20 0 0\n" \ +" chr1 2090 2100 20 0 20\n" \ +" chr1 2100 5000 0 0 0\n" \ "\n" \ ; } std::string stl_basename(const std::string& path) { - string result; + string result; - char* path_dup = strdup(path.c_str()); - char* basename_part = basename(path_dup); - result = basename_part; - free(path_dup); + char* path_dup = strdup(path.c_str()); + char* basename_part = basename(path_dup); + result = basename_part; + free(path_dup); - size_t pos = result.find_last_of('.'); - if (pos != string::npos ) - result = result.substr(0,pos); + size_t pos = result.find_last_of('.'); + if (pos != string::npos ) + result = result.substr(0,pos); - return result; + return result; } diff --git a/src/utils/BamTools/BGZF.cpp b/src/utils/BamTools/BGZF.cpp index 92afb96f83e058960aa4ebf0b2789de947944c0b..853d7b5194ee06e9e6a18b77369433b71337cc27 100644 --- a/src/utils/BamTools/BGZF.cpp +++ b/src/utils/BamTools/BGZF.cpp @@ -58,7 +58,7 @@ void BgzfData::Close(void) { int blockLength = DeflateBlock(); fwrite(CompressedBlock, 1, blockLength, Stream); } - + // flush and close fflush(Stream); fclose(Stream); @@ -84,14 +84,14 @@ int BgzfData::DeflateBlock(void) { // set compression level const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); - + // loop to retry for blocks that do not compress enough int inputLength = BlockOffset; int compressedLength = 0; unsigned int bufferSize = CompressedBlockSize; while ( true ) { - + // initialize zstream values z_stream zs; zs.zalloc = NULL; @@ -182,7 +182,7 @@ void BgzfData::FlushBlock(void) { printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); exit(1); } - + BlockAddress += blockLength; } } @@ -227,25 +227,25 @@ bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncomp // determine open mode if ( strcmp(mode, "rb") == 0 ) IsWriteOnly = false; - else if ( strcmp(mode, "wb") == 0) + else if ( strcmp(mode, "wb") == 0) IsWriteOnly = true; else { printf("BGZF ERROR: unknown file mode: %s\n", mode); - return false; + return false; } // ---------------------------------------------------------------- // open Stream to read to/write from file, stdin, or stdout // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) - + // read/write BGZF data to/from a file if ( (filename != "stdin") && (filename != "stdout") ) Stream = fopen(filename.c_str(), mode); - + // read BGZF data from stdin else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) Stream = freopen(NULL, mode, stdin); - + // write BGZF data to stdout else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) Stream = freopen(NULL, mode, stdout); @@ -254,7 +254,7 @@ bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncomp printf("BGZF ERROR: unable to open file %s\n", filename.c_str() ); return false; } - + // set flags, return success IsOpen = true; IsWriteUncompressed = isWriteUncompressed; @@ -272,7 +272,7 @@ int BgzfData::Read(char* data, const unsigned int dataLength) { int bytesAvailable = BlockLength - BlockOffset; if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; + if ( !ReadBlock() ) return -1; bytesAvailable = BlockLength - BlockOffset; if ( bytesAvailable <= 0 ) break; } @@ -300,7 +300,7 @@ bool BgzfData::ReadBlock(void) { char header[BLOCK_HEADER_LENGTH]; int64_t blockAddress = ftell64(Stream); - + int count = fread(header, 1, sizeof(header), Stream); if ( count == 0 ) { BlockLength = 0; @@ -329,7 +329,7 @@ bool BgzfData::ReadBlock(void) { } count = InflateBlock(blockLength); - if ( count < 0 ) { + if ( count < 0 ) { printf("BGZF ERROR: read block failed - could not decompress block data\n"); return false; } @@ -346,7 +346,7 @@ bool BgzfData::ReadBlock(void) { bool BgzfData::Seek(int64_t position) { if ( !IsOpen ) return false; - + int blockOffset = (position & 0xFFFF); int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; @@ -363,9 +363,9 @@ bool BgzfData::Seek(int64_t position) { // get file position in BGZF file int64_t BgzfData::Tell(void) { - if ( !IsOpen ) + if ( !IsOpen ) return false; - else + else return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); } @@ -373,7 +373,7 @@ int64_t BgzfData::Tell(void) { unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { if ( !IsOpen || !IsWriteOnly ) return false; - + // initialize unsigned int numBytesWritten = 0; const char* input = data; @@ -381,7 +381,7 @@ unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { // copy the data to the buffer while ( numBytesWritten < dataLen ) { - + unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); char* buffer = UncompressedBlock; memcpy(buffer + BlockOffset, input, copyLength); diff --git a/src/utils/BamTools/BGZF.h b/src/utils/BamTools/BGZF.h index 37bcff75bd2a3a93bb9665c21b250f2b69321246..8a709f4d3a2098b4f4550a4dbdcbad0ef8444f85 100644 --- a/src/utils/BamTools/BGZF.h +++ b/src/utils/BamTools/BGZF.h @@ -33,7 +33,7 @@ #define fseek64(a,b,c) _fseeki64(a,b,c) #else #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) + #define fseek64(a,b,c) fseeko(a,b,c) #endif #endif // BAMTOOLS_LFS @@ -49,7 +49,7 @@ typedef unsigned int uint32_t; typedef long long int64_t; typedef unsigned long long uint64_t; - #else + #else #include <stdint.h> #endif #endif // BAMTOOLS_TYPES @@ -97,7 +97,7 @@ struct BgzfData { ~BgzfData(void); // main interface methods - public: + public: // closes BGZF file void Close(void); // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) @@ -121,7 +121,7 @@ struct BgzfData { int InflateBlock(const int& blockLength); // reads a BGZF block bool ReadBlock(void); - + // static 'utility' methods public: // checks BGZF block header diff --git a/src/utils/BamTools/BamAncillary.cpp b/src/utils/BamTools/BamAncillary.cpp index d33eb159a7559cde10037692440ac33c978021c0..92b025580c9d754985555a993d72e3aa79ed499a 100644 --- a/src/utils/BamTools/BamAncillary.cpp +++ b/src/utils/BamTools/BamAncillary.cpp @@ -16,26 +16,26 @@ using namespace std; // acccctttggacct---ataggga.................aaaa // acccc---ggaccttttataggga.................aaaa // 5M 3D 6M 2I 7M 20N 4M - + namespace BamTools { - void getBamBlocks(const BamAlignment &bam, const RefVector &refs, + void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool breakOnDeletionOps) { - - CHRPOS currPosition = bam.Position; + + CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; string score = ToString(bam.MapQuality); - if (bam.IsReverseStrand()) strand = "-"; - - vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); - vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); + if (bam.IsReverseStrand()) strand = "-"; + + vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); + vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; - blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); - blockStart = currPosition; + blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); + blockStart = currPosition; } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) @@ -53,9 +53,9 @@ namespace BamTools { } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type - << ") for: " << bam.Name << endl; - exit(1); + << ") for: " << bam.Name << endl; + exit(1); } - } + } } } diff --git a/src/utils/BamTools/BamAncillary.h b/src/utils/BamTools/BamAncillary.h index 3f67825796aaa82644ab2f5e60626ec0ab1908bd..fcc04d7cde9b034c39e4573ac051f88291f496d4 100644 --- a/src/utils/BamTools/BamAncillary.h +++ b/src/utils/BamTools/BamAncillary.h @@ -14,6 +14,6 @@ #include "BamAux.h" namespace BamTools { - void getBamBlocks(const BamAlignment &bam, const RefVector &refs, + void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool includeDeletions = true); } diff --git a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h index 866ca15d13b9048c2ee4d0849c651c252dba6475..c3eb22c96cadc04a8b8f9c4fe363ead4a2c28a8e 100644 --- a/src/utils/BamTools/BamAux.h +++ b/src/utils/BamTools/BamAux.h @@ -56,7 +56,7 @@ const int BAM_CIGAR_SHIFT = 4; const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); // BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 +const int MAX_BIN = 37450; // =(8^6-1)/7+1 const int BAM_MIN_CHUNK_GAP = 32768; const int BAM_LIDX_SHIFT = 14; @@ -74,38 +74,38 @@ struct BamAlignment { ~BamAlignment(void); // Queries against alignment flags - public: - bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate - bool IsFailedQC(void) const; // Returns true if this read failed quality control - bool IsFirstMate(void) const; // Returns true if alignment is first mate on read - bool IsMapped(void) const; // Returns true if alignment is mapped - bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped - bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand - bool IsPaired(void) const; // Returns true if alignment part of paired-end read - bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment - bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution - bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand - bool IsSecondMate(void) const; // Returns true if alignment is second mate on read + public: + bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate + bool IsFailedQC(void) const; // Returns true if this read failed quality control + bool IsFirstMate(void) const; // Returns true if alignment is first mate on read + bool IsMapped(void) const; // Returns true if alignment is mapped + bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped + bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand + bool IsPaired(void) const; // Returns true if alignment part of paired-end read + bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment + bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution + bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand + bool IsSecondMate(void) const; // Returns true if alignment is second mate on read // Manipulate alignment flags - public: - void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag - void SetIsFailedQC(bool ok); // Sets "failed quality control" flag - void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag - void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag - void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag - void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag - void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag - void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag - void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag - void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag - void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag + public: + void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag + void SetIsFailedQC(bool ok); // Sets "failed quality control" flag + void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag + void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag + void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag + void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag + void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag + void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag + void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag + void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag + void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag // Tag data access methods public: // ------------------------------------------------------------------------------------- // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched - // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in + // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in // error message (to keep output clean) but will ALWAYS return false. Only user- // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid. @@ -117,7 +117,7 @@ struct BamAlignment { bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - + // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details // returns true if edit was successfaul, false if error @@ -128,14 +128,14 @@ struct BamAlignment { // specific tag data access methods - these only remain for legacy support bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance)) - bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) - - // generic tag data access methods - bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings + bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) + + // generic tag data access methods + bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data bool GetTag(const std::string& tag, float& destination) const; // access floating point data - + // remove tag data // returns true if removal was successful, false if error // N.B. - returns false if TAG does not exist (no removal can occur) @@ -143,9 +143,9 @@ struct BamAlignment { // Additional data access methods public: - int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations + int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations - // 'internal' utility methods + // 'internal' utility methods private: static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); @@ -162,16 +162,16 @@ struct BamAlignment { int32_t Position; // Position (0-based) where alignment starts uint16_t Bin; // Bin in BAM file where this alignment resides uint16_t MapQuality; // Mapping quality score - uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate + uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate std::vector<CigarOp> CigarData; // CIGAR operations for this alignment int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned int32_t MatePosition; // Position (0-based) where alignment's mate starts int32_t InsertSize; // Mate-pair insert size - + // internal data private: struct BamAlignmentSupportData { - + // data members std::string AllCharData; uint32_t BlockLength; @@ -179,7 +179,7 @@ struct BamAlignment { uint32_t QueryNameLength; uint32_t QuerySequenceLength; bool HasCoreOnly; - + // constructor BamAlignmentSupportData(void) : BlockLength(0) @@ -189,10 +189,10 @@ struct BamAlignment { , HasCoreOnly(false) { } }; - + // contains raw character data & lengths - BamAlignmentSupportData SupportData; - + BamAlignmentSupportData SupportData; + // allow these classes access to BamAlignment private members (SupportData) // but client code should not need to touch this data friend class BamReader; @@ -211,7 +211,7 @@ struct BamAlignment { , READ_2 = 128 , SECONDARY = 256 , QC_FAILED = 512 - , DUPLICATE = 1024 + , DUPLICATE = 1024 }; }; @@ -219,28 +219,28 @@ struct BamAlignment { // Auxiliary data structs & typedefs struct CigarOp { - + // data members char Type; // Operation type (MIDNSHP) uint32_t Length; // Operation length (number of bases) - + // constructor - CigarOp(const char type = '\0', - const uint32_t length = 0) + CigarOp(const char type = '\0', + const uint32_t length = 0) : Type(type) - , Length(length) + , Length(length) { } }; struct RefData { - + // data members std::string RefName; // Name of reference sequence int32_t RefLength; // Length of reference sequence bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence - + // constructor - RefData(const int32_t& length = 0, + RefData(const int32_t& length = 0, bool ok = false) : RefLength(length) , RefHasAlignments(ok) @@ -251,15 +251,15 @@ typedef std::vector<RefData> RefVector; typedef std::vector<BamAlignment> BamAlignmentVector; struct BamRegion { - + // data members int LeftRefID; int LeftPosition; int RightRefID; int RightPosition; - + // constructor - BamRegion(const int& leftID = -1, + BamRegion(const int& leftID = -1, const int& leftPos = -1, const int& rightID = -1, const int& rightPos = -1) @@ -292,24 +292,24 @@ inline void SwapEndian_16(uint16_t& x) { // swaps endianness of 32-bit value 'in-place' inline void SwapEndian_32(int32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | (x << 24) ); } inline void SwapEndian_32(uint32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | (x << 24) ); } // swaps endianness of 64-bit value 'in-place' inline void SwapEndian_64(int64_t& x) { - x = ( (x >> 56) | + x = ( (x >> 56) | ((x << 40) & 0x00FF000000000000ll) | ((x << 24) & 0x0000FF0000000000ll) | ((x << 8) & 0x000000FF00000000ll) | @@ -321,7 +321,7 @@ inline void SwapEndian_64(int64_t& x) { } inline void SwapEndian_64(uint64_t& x) { - x = ( (x >> 56) | + x = ( (x >> 56) | ((x << 40) & 0x00FF000000000000ll) | ((x << 24) & 0x0000FF0000000000ll) | ((x << 8) & 0x000000FF00000000ll) | @@ -334,19 +334,19 @@ inline void SwapEndian_64(uint64_t& x) { // swaps endianness of 'next 2 bytes' in a char buffer (in-place) inline void SwapEndian_16p(char* data) { - uint16_t& value = (uint16_t&)*data; + uint16_t& value = (uint16_t&)*data; SwapEndian_16(value); } // swaps endianness of 'next 4 bytes' in a char buffer (in-place) inline void SwapEndian_32p(char* data) { - uint32_t& value = (uint32_t&)*data; + uint32_t& value = (uint32_t&)*data; SwapEndian_32(value); } // swaps endianness of 'next 8 bytes' in a char buffer (in-place) inline void SwapEndian_64p(char* data) { - uint64_t& value = (uint64_t&)*data; + uint64_t& value = (uint64_t&)*data; SwapEndian_64(value); } @@ -390,7 +390,7 @@ inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentF inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } -// Manipulate alignment flags +// Manipulate alignment flags inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } @@ -404,7 +404,7 @@ inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFl inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } // calculates alignment end position, based on starting position and CIGAR operations -inline +inline int BamAlignment::GetEndPosition(bool usePadded) const { // initialize alignment end to starting position @@ -414,10 +414,10 @@ int BamAlignment::GetEndPosition(bool usePadded) const { std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } + const char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { + alignEnd += (*cigarIter).Length; + } else if ( usePadded && cigarType == 'I' ) { alignEnd += (*cigarIter).Length; } @@ -427,53 +427,53 @@ int BamAlignment::GetEndPosition(bool usePadded) const { inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { - + if ( SupportData.HasCoreOnly ) return false; if ( tag.size() != 2 || type.size() != 1 ) return false; if ( type != "Z" && type != "H" ) return false; - + // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - + // if tag already exists, return false // use EditTag explicitly instead if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - + // otherwise, copy tag data to temp buffer std::string newTag = tag + type + value; const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term char originalTagData[newTagDataLength]; memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - + // append newTag strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - + // store temp buffer back in TagData const char* newTagData = (const char*)originalTagData; TagData.assign(newTagData, newTagDataLength); - + // return success return true; } inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { - + if ( SupportData.HasCoreOnly ) return false; if ( tag.size() != 2 || type.size() != 1 ) return false; if ( type == "f" || type == "Z" || type == "H" ) return false; - + // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - + // if tag already exists, return false // use EditTag explicitly instead if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - + // otherwise, convert value to string union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; un.value = value; @@ -483,15 +483,15 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer char originalTagData[newTagDataLength]; memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - + // append newTag strcat(originalTagData + tagDataLength, newTag.data()); memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); - + // store temp buffer back in TagData const char* newTagData = (const char*)originalTagData; TagData.assign(newTagData, newTagDataLength); - + // return success return true; } @@ -503,20 +503,20 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { - + if ( SupportData.HasCoreOnly ) return false; if ( tag.size() != 2 || type.size() != 1 ) return false; if ( type == "Z" || type == "H" ) return false; - + // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - + // if tag already exists, return false // use EditTag explicitly instead if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - + // otherwise, convert value to string union { float value; char valueBuffer[sizeof(float)]; } un; un.value = value; @@ -526,120 +526,120 @@ bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float char originalTagData[newTagDataLength]; memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - + // append newTag strcat(originalTagData + tagDataLength, newTag.data()); memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - + // store temp buffer back in TagData const char* newTagData = (const char*)originalTagData; TagData.assign(newTagData, newTagDataLength); - + // return success return true; } inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { - + if ( SupportData.HasCoreOnly ) return false; if ( tag.size() != 2 || type.size() != 1 ) return false; if ( type != "Z" && type != "H" ) return false; - + // localize the tag data char* pOriginalTagData = (char*)TagData.data(); char* pTagData = pOriginalTagData; const unsigned int originalTagDataLength = TagData.size(); - + unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - + // if tag found, store data in readGroup, return success if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - + // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; + char newTagData[originalTagDataLength + value.size()]; // copy original tag data up til desired tag const unsigned int beginningTagDataLength = numBytesParsed; newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - + // copy new VALUE in place of current tag data const unsigned int dataLength = strlen(value.c_str()); memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) + + // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData - 1; if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - + // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - + // ensure null-terminator newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - + // save new tag data TagData.assign(newTagData, endTagOffset + endTagDataLength); return true; } - + // tag not found, attempt AddTag else return AddTag(tag, type, value); } inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { - + if ( SupportData.HasCoreOnly ) return false; if ( tag.size() != 2 || type.size() != 1 ) return false; if ( type == "f" || type == "Z" || type == "H" ) return false; - + // localize the tag data char* pOriginalTagData = (char*)TagData.data(); char* pTagData = pOriginalTagData; const unsigned int originalTagDataLength = TagData.size(); - + unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - + // if tag found, store data in readGroup, return success if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - + // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; + char newTagData[originalTagDataLength + sizeof(value)]; // copy original tag data up til desired tag const unsigned int beginningTagDataLength = numBytesParsed; newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - + // copy new VALUE in place of current tag data union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; un.value = value; memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); - - // skip to next tag (if tag for removal is last, return true) + + // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData - 1; if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - + // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - + // ensure null-terminator newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - + // save new tag data TagData.assign(newTagData, endTagOffset + endTagDataLength); return true; } - + // tag not found, attempt AddTag else return AddTag(tag, type, value); } @@ -651,67 +651,67 @@ bool BamAlignment::EditTag(const std::string& tag, const std::string& type, cons inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { - + if ( SupportData.HasCoreOnly ) return false; if ( tag.size() != 2 || type.size() != 1 ) return false; if ( type == "Z" || type == "H" ) return false; - + // localize the tag data char* pOriginalTagData = (char*)TagData.data(); char* pTagData = pOriginalTagData; const unsigned int originalTagDataLength = TagData.size(); - + unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - + // if tag found, store data in readGroup, return success if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - + // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; + char newTagData[originalTagDataLength + sizeof(value)]; // copy original tag data up til desired tag const unsigned int beginningTagDataLength = numBytesParsed; newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - + // copy new VALUE in place of current tag data union { float value; char valueBuffer[sizeof(float)]; } un; un.value = value; memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) + + // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData - 1; if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - + // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - + // ensure null-terminator newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - + // save new tag data TagData.assign(newTagData, endTagOffset + endTagDataLength); return true; } - + // tag not found, attempt AddTag else return AddTag(tag, type, value); } // get "NM" tag data - originally contributed by Aaron Quinlan // stores data in 'editDistance', returns success/fail -inline -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { +inline +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { return GetTag("NM", (uint32_t&)editDistance); } // get "RG" tag data // stores data in 'readGroup', returns success/fail -inline +inline bool BamAlignment::GetReadGroup(std::string& readGroup) const { return GetTag("RG", readGroup); } @@ -720,14 +720,14 @@ inline bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - + // if tag found, store data in readGroup, return success if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { const unsigned int dataLength = strlen(pTagData); @@ -736,26 +736,26 @@ bool BamAlignment::GetTag(const std::string& tag, std::string& destination) cons memcpy( (char*)destination.data(), pTagData, dataLength ); return true; } - + // tag not found, return failure return false; } inline bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { - + // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - + // if tag found, determine data byte-length, store data in readGroup, return success if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - + // determine data byte-length const char type = *(pTagData - 1); int destinationLength = 0; @@ -791,13 +791,13 @@ bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); return false; } - + // store in destination destination = 0; memcpy(&destination, pTagData, destinationLength); return true; } - + // tag not found, return failure return false; } @@ -809,20 +809,20 @@ bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { inline bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - + // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; // localize the tag data char* pTagData = (char*)TagData.data(); const unsigned int tagDataLength = TagData.size(); unsigned int numBytesParsed = 0; - + // if tag found, determine data byte-length, store data in readGroup, return success if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { //pTagData += numBytesParsed; - + // determine data byte-length const char type = *(pTagData - 1); int destinationLength = 0; @@ -847,7 +847,7 @@ bool BamAlignment::GetTag(const std::string& tag, float& destination) const { case 'I': destinationLength = 4; break; - + // unsupported type (var-length strings) case 'Z': case 'H': @@ -859,34 +859,34 @@ bool BamAlignment::GetTag(const std::string& tag, float& destination) const { printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); return false; } - + // store in destination destination = 0.0; memcpy(&destination, pTagData, destinationLength); return true; } - + // tag not found, return failure return false; } inline bool BamAlignment::RemoveTag(const std::string& tag) { - + // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed // also, return false if no data present to remove if ( SupportData.HasCoreOnly || TagData.empty() ) return false; - + // localize the tag data char* pOriginalTagData = (char*)TagData.data(); char* pTagData = pOriginalTagData; const unsigned int originalTagDataLength = TagData.size(); unsigned int newTagDataLength = 0; unsigned int numBytesParsed = 0; - + // if tag found, store data in readGroup, return success if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - + char newTagData[originalTagDataLength]; // copy original tag data up til desired tag @@ -895,23 +895,23 @@ bool BamAlignment::RemoveTag(const std::string& tag) { const unsigned int beginningTagDataLength = numBytesParsed; newTagDataLength += beginningTagDataLength; memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) + + // skip to next tag (if tag for removal is last, return true) const char* pTagStorageType = pTagData + 2; pTagData += 3; numBytesParsed += 3; if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - + // copy everything from current tag (the next one after tag for removal) to end const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - + // save new tag data TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); return true; } - + // tag not found, no removal - return failure return false; } @@ -927,22 +927,22 @@ bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsign numBytesParsed += 3; // check the current tag, return true on match - if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) + if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) return true; // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; + if ( *pTagStorageType == '\0' ) return false; if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; if ( *pTagData == '\0' ) return false; } - + // checked all tags, none match return false; } inline bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - + switch(storageType) { case 'A': @@ -976,12 +976,12 @@ bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsign ++pTagData; break; - default: + default: // error case printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType); return false; } - + // return success return true; } diff --git a/src/utils/BamTools/BamIndex.cpp b/src/utils/BamTools/BamIndex.cpp index d74e751cd902c546760afa42e3ba7c5bbe90b0ca..a3fa09e9a11fb307e106fe965b1cbd346c8c2bc4 100644 --- a/src/utils/BamTools/BamIndex.cpp +++ b/src/utils/BamTools/BamIndex.cpp @@ -5,8 +5,8 @@ // --------------------------------------------------------------------------- // Last modified: 17 August 2010 (DB) // --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index +// Provides index functionality - both for the default (standardized) BAM +// index format (.bai) as well as a BamTools-specific (nonstandard) index // format (.bti). // *************************************************************************** @@ -24,21 +24,21 @@ using namespace BamTools; // ------------------------------- // BamIndex implementation -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) +BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) : m_BGZF(bgzf) , m_reader(reader) , m_isBigEndian(isBigEndian) -{ - if ( m_reader && m_reader->IsOpen() ) +{ + if ( m_reader && m_reader->IsOpen() ) m_references = m_reader->GetReferenceData(); } bool BamIndex::HasAlignments(const int& referenceID) { - + // return false if invalid ID - if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) ) + if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) ) return false; - + // else return status of reference (has alignments?) else return m_references.at(referenceID).RefHasAlignments; @@ -48,9 +48,9 @@ bool BamIndex::HasAlignments(const int& referenceID) { // ######################################################################################### // ------------------------------- -// BamDefaultIndex structs & typedefs - -namespace BamTools { +// BamDefaultIndex structs & typedefs + +namespace BamTools { // -------------------------------------------------- // BamDefaultIndex data structures & typedefs @@ -61,7 +61,7 @@ struct Chunk { uint64_t Stop; // constructor - Chunk(const uint64_t& start = 0, + Chunk(const uint64_t& start = 0, const uint64_t& stop = 0) : Start(start) , Stop(stop) @@ -77,11 +77,11 @@ typedef map<uint32_t, ChunkVector> BamBinMap; typedef vector<uint64_t> LinearOffsetVector; struct ReferenceIndex { - + // data members BamBinMap Bins; LinearOffsetVector Offsets; - + // constructor ReferenceIndex(const BamBinMap& binMap = BamBinMap(), const LinearOffsetVector& offsets = LinearOffsetVector()) @@ -93,27 +93,27 @@ struct ReferenceIndex { typedef vector<ReferenceIndex> BamDefaultIndexData; } // namespace BamTools - + // ------------------------------- // BamDefaultIndex implementation - -struct BamDefaultIndex::BamDefaultIndexPrivate { - + +struct BamDefaultIndex::BamDefaultIndexPrivate { + // ------------------------- // data members - + BamDefaultIndexData m_indexData; BamDefaultIndex* m_parent; - + // ------------------------- // ctor & dtor - + BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { } ~BamDefaultIndexPrivate(void) { } - + // ------------------------- // internal methods - + // calculate bins that overlap region int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); // saves BAM bin entry for index @@ -122,14 +122,14 @@ struct BamDefaultIndex::BamDefaultIndexPrivate { void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); // simplifies index by merging 'chunks' void MergeChunks(void); - + }; - + BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) : BamIndex(bgzf, reader, isBigEndian) { d = new BamDefaultIndexPrivate(this); -} +} BamDefaultIndex::~BamDefaultIndex(void) { d->m_indexData.clear(); @@ -139,20 +139,20 @@ BamDefaultIndex::~BamDefaultIndex(void) { // calculate bins that overlap region int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { - + // get region boundaries uint32_t begin = (unsigned int)region.LeftPosition; uint32_t end; - + // if right bound specified AND left&right bounds are on same reference // OK to use right bound position if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) end = (unsigned int)region.RightPosition; - + // otherwise, use end of left bound reference as cutoff else end = (unsigned int)m_parent->m_references.at(region.LeftRefID).RefLength - 1; - + // initialize list, bin '0' always a valid bin int i = 0; bins[i++] = 0; @@ -169,10 +169,10 @@ int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& reg return i; } -bool BamDefaultIndex::Build(void) { - +bool BamDefaultIndex::Build(void) { + // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) return false; // move file pointer to beginning of alignments @@ -183,7 +183,7 @@ bool BamDefaultIndex::Build(void) { for ( int i = 0; i < numReferences; ++i ) { d->m_indexData.push_back(ReferenceIndex()); } - + // sets default constant for bin, ID, offset, coordinate variables const uint32_t defaultValue = 0xffffffffu; @@ -276,7 +276,7 @@ bool BamDefaultIndex::Build(void) { // simplify index by merging chunks d->MergeChunks(); - + // iterate through references in index // store whether reference has data & // sort offsets in linear offset vector @@ -300,8 +300,8 @@ bool BamDefaultIndex::Build(void) { return m_reader->Rewind(); } -bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) { - +bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) { + // calculate which bins overlap this region uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins); @@ -316,7 +316,7 @@ bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoun // store all alignment 'chunk' starts (file offsets) for bins in this region for ( int i = 0; i < numBins; ++i ) { - + const uint16_t binKey = bins[i]; map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey); if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { @@ -325,7 +325,7 @@ bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoun std::vector<Chunk>::const_iterator chunksIter = chunks.begin(); std::vector<Chunk>::const_iterator chunksEnd = chunks.end(); for ( ; chunksIter != chunksEnd; ++chunksIter) { - + // if valid chunk found, store its file offset const Chunk& chunk = (*chunksIter); if ( chunk.Stop > minOffset ) @@ -339,7 +339,7 @@ bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoun // sort the offsets before returning sort(offsets.begin(), offsets.end()); - + // return whether any offsets were found return ( offsets.size() != 0 ); } @@ -387,13 +387,13 @@ void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVec // store offset for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) + if ( offsets[i] == 0 ) offsets[i] = lastOffset; } -} +} + +bool BamDefaultIndex::Load(const string& filename) { -bool BamDefaultIndex::Load(const string& filename) { - // open index file, abort on error FILE* indexStream = fopen(filename.c_str(), "rb"); if( !indexStream ) { @@ -403,7 +403,7 @@ bool BamDefaultIndex::Load(const string& filename) { // set placeholder to receive input byte count (suppresses compiler warnings) size_t elementsRead = 0; - + // see if index is valid BAM index char magic[4]; elementsRead = fread(magic, 1, 4, indexStream); @@ -417,7 +417,7 @@ bool BamDefaultIndex::Load(const string& filename) { uint32_t numRefSeqs; elementsRead = fread(&numRefSeqs, 4, 1, indexStream); if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); } - + // intialize space for BamDefaultIndexData data structure d->m_indexData.reserve(numRefSeqs); @@ -448,11 +448,11 @@ bool BamDefaultIndex::Load(const string& filename) { uint32_t numChunks; elementsRead = fread(&numChunks, 4, 1, indexStream); - if ( m_isBigEndian ) { + if ( m_isBigEndian ) { SwapEndian_32(binID); SwapEndian_32(numChunks); } - + // intialize ChunkVector ChunkVector regionChunks; regionChunks.reserve(numChunks); @@ -470,7 +470,7 @@ bool BamDefaultIndex::Load(const string& filename) { SwapEndian_64(left); SwapEndian_64(right); } - + // save ChunkPair regionChunks.push_back( Chunk(left, right) ); } @@ -571,9 +571,9 @@ void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { } } -// writes in-memory index data out to file +// writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamDefaultIndex::Write(const std::string& bamFilename) { +bool BamDefaultIndex::Write(const std::string& bamFilename) { string indexFilename = bamFilename + ".bai"; FILE* indexStream = fopen(indexFilename.c_str(), "wb"); @@ -637,7 +637,7 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { SwapEndian_64(start); SwapEndian_64(stop); } - + // save chunk offsets fwrite(&start, 8, 1, indexStream); fwrite(&stop, 8, 1, indexStream); @@ -674,14 +674,14 @@ bool BamDefaultIndex::Write(const std::string& bamFilename) { // BamToolsIndex implementation namespace BamTools { - + struct BamToolsIndexEntry { - + // data members int64_t Offset; int RefID; int Position; - + // ctor BamToolsIndexEntry(const uint64_t& offset = 0, const int& id = -1, @@ -693,51 +693,51 @@ struct BamToolsIndexEntry { }; typedef vector<BamToolsIndexEntry> BamToolsIndexData; - + } // namespace BamTools struct BamToolsIndex::BamToolsIndexPrivate { - + // ------------------------- // data members BamToolsIndexData m_indexData; BamToolsIndex* m_parent; int32_t m_blockSize; - + // ------------------------- // ctor & dtor - - BamToolsIndexPrivate(BamToolsIndex* parent) + + BamToolsIndexPrivate(BamToolsIndex* parent) : m_parent(parent) , m_blockSize(1000) { } - + ~BamToolsIndexPrivate(void) { } - + // ------------------------- // internal methods }; BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) : BamIndex(bgzf, reader, isBigEndian) -{ +{ d = new BamToolsIndexPrivate(this); -} +} -BamToolsIndex::~BamToolsIndex(void) { +BamToolsIndex::~BamToolsIndex(void) { delete d; d = 0; } -bool BamToolsIndex::Build(void) { - +bool BamToolsIndex::Build(void) { + // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) return false; // move file pointer to beginning of alignments m_reader->Rewind(); - + // plow through alignments, store block offsets int32_t currentBlockCount = 0; int64_t blockStartOffset = m_BGZF->Tell(); @@ -745,68 +745,68 @@ bool BamToolsIndex::Build(void) { int blockStartPosition = -1; BamAlignment al; while ( m_reader->GetNextAlignmentCore(al) ) { - + // set reference flag m_references[al.RefID].RefHasAlignments = true; - + // if beginning of block, save first alignment's refID & position if ( currentBlockCount == 0 ) { blockStartId = al.RefID; blockStartPosition = al.Position; } - + // increment block counter ++currentBlockCount; - + // if block is full, get offset for next block, reset currentBlockCount if ( currentBlockCount == d->m_blockSize ) { - + d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) ); blockStartOffset = m_BGZF->Tell(); currentBlockCount = 0; } } - + return m_reader->Rewind(); } // N.B. - ignores isRightBoundSpecified -bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) { - - // return false if no index data present +bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) { + + // return false if no index data present if ( d->m_indexData.empty() ) return false; - + // clear any prior data offsets.clear(); - + // calculate nearest index to jump to int64_t previousOffset = -1; BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); for ( ; indexIter != indexEnd; ++indexIter ) { - + const BamToolsIndexEntry& entry = (*indexIter); - + // check if we are 'past' beginning of desired region // if so, we will break out & use previously stored offset if ( entry.RefID > region.LeftRefID ) break; if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break; - + // not past desired region, so store current entry offset in previousOffset previousOffset = entry.Offset; } - + // no index was found - if ( previousOffset == -1 ) + if ( previousOffset == -1 ) return false; - + // store offset & return success offsets.push_back(previousOffset); - return true; + return true; } -bool BamToolsIndex::Load(const string& filename) { - +bool BamToolsIndex::Load(const string& filename) { + // open index file, abort on error FILE* indexStream = fopen(filename.c_str(), "rb"); if( !indexStream ) { @@ -816,7 +816,7 @@ bool BamToolsIndex::Load(const string& filename) { // set placeholder to receive input byte count (suppresses compiler warnings) size_t elementsRead = 0; - + // see if index is valid BAM index char magic[4]; elementsRead = fread(magic, 1, 4, indexStream); @@ -829,37 +829,37 @@ bool BamToolsIndex::Load(const string& filename) { // read in block size elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream); if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); } - + // read in number of offsets uint32_t numOffsets; elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } - + // reserve space for index data d->m_indexData.reserve(numOffsets); // iterate over index entries for ( unsigned int i = 0; i < numOffsets; ++i ) { - + uint64_t offset; int id; int position; - + // read in data elementsRead = fread(&offset, sizeof(offset), 1, indexStream); elementsRead = fread(&id, sizeof(id), 1, indexStream); elementsRead = fread(&position, sizeof(position), 1, indexStream); - + // swap endian-ness if necessary if ( m_isBigEndian ) { SwapEndian_64(offset); SwapEndian_32(id); SwapEndian_32(position); } - + // save reference index entry d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) ); - + // set reference flag m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag? } @@ -869,10 +869,10 @@ bool BamToolsIndex::Load(const string& filename) { return true; } -// writes in-memory index data out to file +// writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::Write(const std::string& bamFilename) { - +bool BamToolsIndex::Write(const std::string& bamFilename) { + string indexFilename = bamFilename + ".bti"; FILE* indexStream = fopen(indexFilename.c_str(), "wb"); if ( indexStream == 0 ) { @@ -887,12 +887,12 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { int32_t blockSize = d->m_blockSize; if ( m_isBigEndian ) { SwapEndian_32(blockSize); } fwrite(&blockSize, sizeof(blockSize), 1, indexStream); - + // write number of offset entries uint32_t numOffsets = d->m_indexData.size(); if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); - + // iterate over offset entries BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); @@ -900,19 +900,19 @@ bool BamToolsIndex::Write(const std::string& bamFilename) { // get reference index data const BamToolsIndexEntry& entry = (*indexIter); - + // copy entry data uint64_t offset = entry.Offset; int id = entry.RefID; int position = entry.Position; - + // swap endian-ness if necessary if ( m_isBigEndian ) { SwapEndian_64(offset); SwapEndian_32(id); SwapEndian_32(position); } - + // write the reference index entry fwrite(&offset, sizeof(offset), 1, indexStream); fwrite(&id, sizeof(id), 1, indexStream); diff --git a/src/utils/BamTools/BamIndex.h b/src/utils/BamTools/BamIndex.h index b9ce7d03121b34f8fdb3c2640fe689c41b0954ec..ade11483b6da0f1948de97c9dda2fc277ddda060 100644 --- a/src/utils/BamTools/BamIndex.h +++ b/src/utils/BamTools/BamIndex.h @@ -5,8 +5,8 @@ // --------------------------------------------------------------------------- // Last modified: 17 August 2010 (DB) // --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index +// Provides index functionality - both for the default (standardized) BAM +// index format (.bai) as well as a BamTools-specific (nonstandard) index // format (.bti). // *************************************************************************** @@ -21,13 +21,13 @@ namespace BamTools { class BamReader; class BgzfData; - -// -------------------------------------------------- + +// -------------------------------------------------- // BamIndex base class class BamIndex { public: - BamIndex(BamTools::BgzfData* bgzf, + BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); virtual ~BamIndex(void) { } @@ -40,11 +40,11 @@ class BamIndex { // loads existing data from file into memory virtual bool Load(const std::string& filename) =0; // returns whether reference has alignments or no - virtual bool HasAlignments(const int& referenceID); - // writes in-memory index data out to file + virtual bool HasAlignments(const int& referenceID); + // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) virtual bool Write(const std::string& bamFilename) =0; - + protected: BamTools::BgzfData* m_BGZF; BamTools::BamReader* m_reader; @@ -54,18 +54,18 @@ class BamIndex { // -------------------------------------------------- // BamDefaultIndex class -// +// // implements default (per SAM/BAM spec) index file ops class BamDefaultIndex : public BamIndex { - + // ctor & dtor public: - BamDefaultIndex(BamTools::BgzfData* bgzf, + BamDefaultIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); ~BamDefaultIndex(void); - + // interface (implements BamIndex virtual methods) public: // creates index data (in-memory) from current reader data @@ -74,10 +74,10 @@ class BamDefaultIndex : public BamIndex { bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets); // loads existing data from file into memory bool Load(const std::string& filename); - // writes in-memory index data out to file + // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) bool Write(const std::string& bamFilename); - + // internal implementation private: struct BamDefaultIndexPrivate; @@ -92,11 +92,11 @@ class BamToolsIndex : public BamIndex { // ctor & dtor public: - BamToolsIndex(BamTools::BgzfData* bgzf, + BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian); ~BamToolsIndex(void); - + // interface (implements BamIndex virtual methods) public: // creates index data (in-memory) from current reader data @@ -105,10 +105,10 @@ class BamToolsIndex : public BamIndex { bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets); // loads existing data from file into memory bool Load(const std::string& filename); - // writes in-memory index data out to file + // writes in-memory index data out to file // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) bool Write(const std::string& bamFilename); - + // internal implementation private: struct BamToolsIndexPrivate; diff --git a/src/utils/BamTools/BamMultiReader.cpp b/src/utils/BamTools/BamMultiReader.cpp index 005b0b0adc7c6c22c12f4317791785803a974bd6..11d48daff295b0d602b78edfb1c24104637614e3 100644 --- a/src/utils/BamTools/BamMultiReader.cpp +++ b/src/utils/BamTools/BamMultiReader.cpp @@ -137,7 +137,7 @@ bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { // and add another entry if we can get another alignment from the reader if (reader->GetNextAlignmentCore(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), make_pair(reader, alignment))); } else { // do nothing //cerr << "reached end of file " << lowestReader->GetFilename() << endl; @@ -201,7 +201,7 @@ void BamMultiReader::UpdateAlignments(void) { BamReader* br = it->first; BamAlignment* ba = it->second; if (br->GetNextAlignment(*ba)) { - alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), + alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), make_pair(br, ba))); } else { // assume BamReader end of region / EOF @@ -211,7 +211,7 @@ void BamMultiReader::UpdateAlignments(void) { // opens BAM files bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { - + // for filename in filenames fileNames = filenames; // save filenames in our multireader for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { @@ -222,15 +222,15 @@ bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool if (openIndexes) { if (useDefaultIndex) openedOK = reader->Open(filename, filename + ".bai"); - else + else openedOK = reader->Open(filename, filename + ".bti"); } else { openedOK = reader->Open(filename); // for merging, jumping is disallowed } - + // if file opened ok, check that it can be read if ( openedOK ) { - + bool fileOK = true; BamAlignment* alignment = new BamAlignment; if (coreMode) { @@ -238,7 +238,7 @@ bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool } else { fileOK &= reader->GetNextAlignment(*alignment); } - + if (fileOK) { readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), @@ -248,11 +248,11 @@ bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool // if only file available & could not be read, return failure if ( filenames.size() == 1 ) return false; } - - } - + + } + // TODO; any more error handling on openedOK ?? - else + else return false; } @@ -277,7 +277,7 @@ void BamMultiReader::DumpAlignmentIndex(void) { } // returns BAM file pointers to beginning of alignment data -bool BamMultiReader::Rewind(void) { +bool BamMultiReader::Rewind(void) { bool result = true; for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; @@ -351,7 +351,7 @@ const string BamMultiReader::GetHeaderText(void) const { // warn iff we are reading one file and discover duplicated @RG tags in the header // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { - cerr << "WARNING: duplicate @RG tag " << readGroup + cerr << "WARNING: duplicate @RG tag " << readGroup << " entry in header of " << reader->GetFilename() << endl; } } @@ -377,7 +377,7 @@ void BamMultiReader::ValidateReaders(void) const { BamTools::RefVector::const_iterator c = currentRefData.begin(); if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { cerr << "ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstRefCount + << " expected " << firstRefCount << " reference sequences but only found " << reader->GetReferenceCount() << endl; exit(1); } @@ -415,6 +415,6 @@ const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { return readers.front().first->GetReferenceData(); } -const int BamMultiReader::GetReferenceID(const string& refName) const { +const int BamMultiReader::GetReferenceID(const string& refName) const { return readers.front().first->GetReferenceID(refName); } diff --git a/src/utils/BamTools/BamReader.cpp b/src/utils/BamTools/BamReader.cpp index b1b1b5de2ab79a76416bd5267f6dc5cc1ecccd02..d6355769144f5d2e7f628251942a04afc672205b 100644 --- a/src/utils/BamTools/BamReader.cpp +++ b/src/utils/BamTools/BamReader.cpp @@ -49,7 +49,7 @@ struct BamReader::BamReaderPrivate { int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; - + // system data bool IsBigEndian; @@ -57,14 +57,14 @@ struct BamReader::BamReaderPrivate { BamRegion Region; bool IsLeftBoundSpecified; bool IsRightBoundSpecified; - + bool IsRegionSpecified; int CurrentRefID; int CurrentLeft; // parent BamReader BamReader* Parent; - + // BAM character constants const char* DNA_LOOKUP; const char* CIGAR_LOOKUP; @@ -138,12 +138,12 @@ BamReader::~BamReader(void) { // file operations void BamReader::Close(void) { d->Close(); } bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) { +bool BamReader::Jump(int refID, int position) { d->Region.LeftRefID = refID; d->Region.LeftPosition = position; d->IsLeftBoundSpecified = true; d->IsRightBoundSpecified = false; - return d->Jump(refID, position); + return d->Jump(refID, position); } bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); } bool BamReader::Rewind(void) { return d->Rewind(); } @@ -183,7 +183,7 @@ BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) , Parent(parent) , DNA_LOOKUP("=ACMGRSVTWYHKDBN") , CIGAR_LOOKUP("MIDNSHP") -{ +{ IsBigEndian = SystemIsBigEndian(); } @@ -193,7 +193,7 @@ BamReader::BamReaderPrivate::~BamReaderPrivate(void) { } bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { - + // calculate character lengths/offsets const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; @@ -201,18 +201,18 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; const unsigned int tagDataLength = dataLength - tagDataOffset; - + // set up char buffers const char* allCharData = bAlignment.SupportData.AllCharData.data(); uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); const char* seqData = ((const char*)allCharData) + seqDataOffset; const char* qualData = ((const char*)allCharData) + qualDataOffset; char* tagData = ((char*)allCharData) + tagDataOffset; - + // store alignment name (depends on null char as terminator) - bAlignment.Name.assign((const char*)(allCharData)); - - // save CigarOps + bAlignment.Name.assign((const char*)(allCharData)); + + // save CigarOps CigarOp op; bAlignment.CigarData.clear(); bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); @@ -220,7 +220,7 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // swap if necessary if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } - + // build CigarOp structure op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; @@ -228,8 +228,8 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { // save CigarOp bAlignment.CigarData.push_back(op); } - - + + // save query sequence bAlignment.QueryBases.clear(); bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); @@ -237,7 +237,7 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; bAlignment.QueryBases.append(1, singleBase); } - + // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character bAlignment.Qualities.clear(); bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); @@ -245,58 +245,58 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { char singleQuality = (char)(qualData[i]+33); bAlignment.Qualities.append(1, singleQuality); } - + // if QueryBases is empty (and this is a allowed case) - if ( bAlignment.QueryBases.empty() ) + if ( bAlignment.QueryBases.empty() ) bAlignment.AlignedBases = bAlignment.QueryBases; - + // if QueryBases contains data, then build AlignedBases using CIGAR data else { - + // resize AlignedBases bAlignment.AlignedBases.clear(); bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); - + // iterate over CigarOps int k = 0; vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin(); vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { - + const CigarOp& op = (*cigarIter); switch(op.Type) { - + case ('M') : case ('I') : bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases // fall through - + case ('S') : k += op.Length; // for 'S' - soft clip, skip over query bases break; - + case ('D') : bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character break; - + case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character break; - + case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence break; - + case ('H') : break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op - + default: printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here exit(1); } } } - + // ----------------------- // Added: 3-25-2010 DB // Fixed: endian-correctness for tag data @@ -304,55 +304,55 @@ bool BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { if ( IsBigEndian ) { int i = 0; while ( (unsigned int)i < tagDataLength ) { - + i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning ++i; // skip value type - + switch (type) { - + case('A') : - case('C') : + case('C') : ++i; break; - case('S') : - SwapEndian_16p(&tagData[i]); + case('S') : + SwapEndian_16p(&tagData[i]); i += sizeof(uint16_t); break; - + case('F') : - case('I') : + case('I') : SwapEndian_32p(&tagData[i]); i += sizeof(uint32_t); break; - - case('D') : + + case('D') : SwapEndian_64p(&tagData[i]); i += sizeof(uint64_t); break; - + case('H') : - case('Z') : + case('Z') : while (tagData[i]) { ++i; } ++i; // increment one more for null terminator break; - - default : + + default : printf("ERROR: Invalid tag value type\n"); // shouldn't get here exit(1); } } } - + // store TagData bAlignment.TagData.clear(); bAlignment.TagData.resize(tagDataLength); memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); - + // clear the core-only flag bAlignment.SupportData.HasCoreOnly = false; - + // return success return true; } @@ -365,16 +365,16 @@ void BamReader::BamReaderPrivate::ClearIndex(void) { // closes the BAM file void BamReader::BamReaderPrivate::Close(void) { - + // close BGZF file stream mBGZF.Close(); - + // clear out index data ClearIndex(); - + // clear out header data HeaderText.clear(); - + // clear out region flags IsLeftBoundSpecified = false; IsRightBoundSpecified = false; @@ -386,18 +386,18 @@ bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) { // clear out prior index data ClearIndex(); - + // create default index if ( useDefaultIndex ) NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); // create BamTools 'custom' index else NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); - + bool ok = true; ok &= NewIndex->Build(); - ok &= NewIndex->Write(Filename); - + ok &= NewIndex->Write(Filename); + // return success/fail return ok; } @@ -408,7 +408,7 @@ bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { // if valid alignment found, attempt to parse char data, and return success/failure if ( GetNextAlignmentCore(bAlignment) ) return BuildCharData(bAlignment); - + // no valid alignment found else return false; @@ -416,7 +416,7 @@ bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { // retrieves next available alignment core data (returns success/fail) // ** DOES NOT parse any character data (read name, bases, qualities, tag data) -// these can be accessed, if necessary, from the supportData +// these can be accessed, if necessary, from the supportData // useful for operations requiring ONLY positional or other alignment-related information bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { @@ -425,15 +425,15 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) // set core-only flag bAlignment.SupportData.HasCoreOnly = true; - + // if region not specified, return success if ( !IsLeftBoundSpecified ) return true; // determine region state (before, within, after) BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment); - + // if alignment lies after region, return false - if ( state == AFTER_REGION ) + if ( state == AFTER_REGION ) return false; while ( state != WITHIN_REGION ) { @@ -442,7 +442,7 @@ bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) // if alignment lies after region, return false (no available read within region) state = IsOverlap(bAlignment); if ( state == AFTER_REGION) return false; - + } // return success (alignment found that overlaps region) @@ -472,30 +472,30 @@ int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { // returns region state - whether alignment ends before, overlaps, or starts after currently specified region // this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { - + // -------------------------------------------------- // check alignment start against right bound cutoff - + // if full region of interest was given if ( IsRightBoundSpecified ) { - + // read starts on right bound reference, but AFTER right bound position if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition ) return AFTER_REGION; - + // if read starts on reference AFTER right bound, return false - if ( bAlignment.RefID > Region.RightRefID ) + if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; } - + // -------------------------------------------------------- // no right bound given OR read starts before right bound - // so, check if it overlaps left bound - + // so, check if it overlaps left bound + // if read starts on left bound reference AND after left boundary, return success if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition) return WITHIN_REGION; - + // if read is on any reference sequence before left bound, return false if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; @@ -515,29 +515,29 @@ BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap( bool BamReader::BamReaderPrivate::Jump(int refID, int position) { // ----------------------------------------------------------------------- - // check for existing index - if ( NewIndex == 0 ) return false; + // check for existing index + if ( NewIndex == 0 ) return false; // see if reference has alignments - if ( !NewIndex->HasAlignments(refID) ) return false; + if ( !NewIndex->HasAlignments(refID) ) return false; // make sure position is valid if ( position > References.at(refID).RefLength ) return false; - + // determine possible offsets vector<int64_t> offsets; if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) { printf("ERROR: Could not jump: unable to calculate offset for specified region.\n"); return false; } - + // iterate through offsets BamAlignment bAlignment; bool result = true; for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - + // attempt seek & load first available alignment result &= mBGZF.Seek(*o); LoadNextAlignment(bAlignment); - + // if this alignment corresponds to desired position // return success of seeking back to 'current offset' if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) { @@ -545,7 +545,7 @@ bool BamReader::BamReaderPrivate::Jump(int refID, int position) { return mBGZF.Seek(*o); } } - + return result; } @@ -568,7 +568,7 @@ void BamReader::BamReaderPrivate::LoadHeaderData(void) { mBGZF.Read(buffer, 4); unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); if ( IsBigEndian ) { SwapEndian_32(headerTextLength); } - + // get BAM header text char* headerText = (char*)calloc(headerTextLength + 1, 1); mBGZF.Read(headerText, headerTextLength); @@ -591,21 +591,21 @@ bool BamReader::BamReaderPrivate::LoadIndex(void) { // check supplied filename for index type size_t defaultExtensionFound = IndexFilename.find(".bai"); size_t customExtensionFound = IndexFilename.find(".bti"); - + // if SAM/BAM default (".bai") if ( defaultExtensionFound != string::npos ) NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); - + // if BamTools custom index (".bti") else if ( customExtensionFound != string::npos ) NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); - + // else unknown else { printf("ERROR: Unknown index file extension.\n"); return false; } - + // return success of loading index data return NewIndex->Load(IndexFilename); } @@ -625,15 +625,15 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { - SwapEndian_32p(&x[i]); + for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { + SwapEndian_32p(&x[i]); } } - + // set BamAlignment 'core' and 'support' data - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); + bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - + unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); bAlignment.Bin = tempValue >> 16; bAlignment.MapQuality = tempValue >> 8 & 0xff; @@ -647,20 +647,20 @@ bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - + // set BamAlignment length bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; - + // read in character data - make sure proper data size was read bool readCharDataOK = false; const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; char* allCharData = (char*)calloc(sizeof(char), dataLength); - - if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { - + + if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { + // store 'allCharData' in supportData structure bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); - + // set success flag readCharDataOK = true; } @@ -713,9 +713,9 @@ bool BamReader::BamReaderPrivate::Open(const string& filename, const string& ind IndexFilename = indexFilename; // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) + if ( !mBGZF.Open(filename, "rb") ) return false; - + // retrieve header text & reference data LoadHeaderData(); LoadReferenceData(); @@ -726,28 +726,28 @@ bool BamReader::BamReaderPrivate::Open(const string& filename, const string& ind // open index file & load index data (if exists) if ( !IndexFilename.empty() ) LoadIndex(); - + // return success return true; } // returns BAM file pointer to beginning of alignment data bool BamReader::BamReaderPrivate::Rewind(void) { - + // rewind to first alignment if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; - + // retrieve first alignment data BamAlignment al; if ( !LoadNextAlignment(al) ) return false; - + // reset default region info using first alignment in file Region.LeftRefID = al.RefID; Region.LeftPosition = al.Position; Region.RightRefID = -1; Region.RightPosition = -1; IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; + IsRightBoundSpecified = false; // rewind back to before first alignment // return success/fail of seek @@ -758,16 +758,16 @@ bool BamReader::BamReaderPrivate::Rewind(void) { // attempts a Jump() to left bound as well // returns success/failure of Jump() bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { - + // save region of interest Region = region; - + // set flags - if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) + if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) IsLeftBoundSpecified = true; - if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) + if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) IsRightBoundSpecified = true; - + // attempt jump to beginning of region, return success/fail of Jump() return Jump( Region.LeftRefID, Region.LeftPosition ); } diff --git a/src/utils/BamTools/BamReader.h b/src/utils/BamTools/BamReader.h index c93987b1a41bfbe41b7f0a10bd9d826a3514e681..a56316efcafd41e7ab33ab07b71316d12d49b158 100644 --- a/src/utils/BamTools/BamReader.h +++ b/src/utils/BamTools/BamReader.h @@ -21,7 +21,7 @@ #include "BamAux.h" namespace BamTools { - + class BamReader { // constructor / destructor @@ -58,10 +58,10 @@ class BamReader { // retrieves next available alignment (returns success/fail) bool GetNextAlignment(BamAlignment& bAlignment); - + // retrieves next available alignment core data (returns success/fail) // ** DOES NOT parse any character data (read name, bases, qualities, tag data) - // these can be accessed, if necessary, from the supportData + // these can be accessed, if necessary, from the supportData // useful for operations requiring ONLY positional or other alignment-related information bool GetNextAlignmentCore(BamAlignment& bAlignment); @@ -86,7 +86,7 @@ class BamReader { // creates index for BAM file, saves to file (default = bamFilename + ".bai") bool CreateIndex(bool useDefaultIndex = true); - + // private implementation private: struct BamReaderPrivate; diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp index f83ff1c3046ed46876c655b5b11f0535d634bda0..49e223b29e45e6e370fd058794c20bdbc0f630f9 100644 --- a/src/utils/BamTools/BamWriter.cpp +++ b/src/utils/BamTools/BamWriter.cpp @@ -23,12 +23,12 @@ struct BamWriter::BamWriterPrivate { // data members BgzfData mBGZF; bool IsBigEndian; - + // constructor / destructor - BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); + BamWriterPrivate(void) { + IsBigEndian = SystemIsBigEndian(); } - + ~BamWriterPrivate(void) { mBGZF.Close(); } @@ -60,8 +60,8 @@ BamWriter::~BamWriter(void) { } // closes the alignment archive -void BamWriter::Close(void) { - d->Close(); +void BamWriter::Close(void) { + d->Close(); } // opens the alignment archive @@ -70,7 +70,7 @@ bool BamWriter::Open(const string& filename, const string& samHeader, const RefV } // saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { +void BamWriter::SaveAlignment(const BamAlignment& al) { d->SaveAlignment(al); } @@ -84,7 +84,7 @@ void BamWriter::BamWriterPrivate::Close(void) { } // calculates minimum bin for a BAM alignment interval -const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { +const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { --end; if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); @@ -156,31 +156,31 @@ void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, strin while(*pQuery) { switch(*pQuery) { - + case '=': nucleotideCode = 0; break; - + case 'A': nucleotideCode = 1; break; - + case 'C': nucleotideCode = 2; break; - + case 'G': nucleotideCode = 4; break; - + case 'T': nucleotideCode = 8; break; - + case 'N': nucleotideCode = 15; break; - + default: printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); exit(1); @@ -223,7 +223,7 @@ bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); // write the SAM header text - if(samHeaderLen > 0) + if(samHeaderLen > 0) mBGZF.Write(samHeader.data(), samHeaderLen); // write the number of reference sequences @@ -251,7 +251,7 @@ bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& sam if (IsBigEndian) SwapEndian_32(referenceLength); mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); } - + // return success return true; } @@ -262,7 +262,7 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { // if BamAlignment contains only the core data and a raw char data buffer // (as a result of BamReader::GetNextAlignmentCore()) if ( al.SupportData.HasCoreOnly ) { - + // write the block size unsigned int blockSize = al.SupportData.BlockLength; if (IsBigEndian) SwapEndian_32(blockSize); @@ -278,35 +278,35 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { buffer[5] = al.MateRefID; buffer[6] = al.MatePosition; buffer[7] = al.InsertSize; - + // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { + if ( IsBigEndian ) { for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); + SwapEndian_32(buffer[i]); } - + // write the BAM core mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - + // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); + mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); } - + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) else { - + // calculate char lengths const unsigned int nameLength = al.Name.size() + 1; const unsigned int numCigarOperations = al.CigarData.size(); const unsigned int queryLength = al.QueryBases.size(); const unsigned int tagDataLength = al.TagData.size(); - + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) // force calculation of Bin before storing const int endPosition = al.GetEndPosition(); const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - + // create our packed cigar string string packedCigar; CreatePackedCigar(al.CigarData, packedCigar); @@ -315,8 +315,8 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { // encode the query string encodedQuery; EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - + const unsigned int encodedQueryLength = encodedQuery.size(); + // write the block size const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; @@ -333,34 +333,34 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { buffer[5] = al.MateRefID; buffer[6] = al.MatePosition; buffer[7] = al.InsertSize; - + // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { + if ( IsBigEndian ) { for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); + SwapEndian_32(buffer[i]); } - + // write the BAM core mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - + // write the query name mBGZF.Write(al.Name.c_str(), nameLength); // write the packed cigar if ( IsBigEndian ) { - + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); memcpy(cigarData, packedCigar.data(), packedCigarLength); - + for (unsigned int i = 0; i < packedCigarLength; ++i) { if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); + SwapEndian_32p(&cigarData[i]); } - + mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else + free(cigarData); + } + else mBGZF.Write(packedCigar.data(), packedCigarLength); // write the encoded query sequence @@ -369,64 +369,64 @@ void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { // write the base qualities string baseQualities(al.Qualities); char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) { - pBaseQualities[i] -= 33; + for(unsigned int i = 0; i < queryLength; i++) { + pBaseQualities[i] -= 33; } mBGZF.Write(pBaseQualities, queryLength); // write the read group tag if ( IsBigEndian ) { - + char* tagData = (char*)calloc(sizeof(char), tagDataLength); memcpy(tagData, al.TagData.data(), tagDataLength); - + int i = 0; while ( (unsigned int)i < tagDataLength ) { - + i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning ++i; // skip value type - + switch (type) { - + case('A') : - case('C') : + case('C') : ++i; break; - - case('S') : - SwapEndian_16p(&tagData[i]); + + case('S') : + SwapEndian_16p(&tagData[i]); i+=2; // sizeof(uint16_t) break; - + case('F') : - case('I') : + case('I') : SwapEndian_32p(&tagData[i]); i+=4; // sizeof(uint32_t) break; - - case('D') : + + case('D') : SwapEndian_64p(&tagData[i]); i+=8; // sizeof(uint64_t) break; - + case('H') : - case('Z') : + case('Z') : while (tagData[i]) { ++i; } ++i; // increment one more for null terminator break; - - default : + + default : printf("ERROR: Invalid tag value type\n"); // shouldn't get here free(tagData); - exit(1); + exit(1); } } - + mBGZF.Write(tagData, tagDataLength); free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); + } + else + mBGZF.Write(al.TagData.data(), tagDataLength); } } diff --git a/src/utils/BamTools/BamWriter.h b/src/utils/BamTools/BamWriter.h index b0cb6ceba3457cc58f3f3670522148a394a39a96..20e3ffc2750d266609b3be0d3464bcfa61886a5d 100644 --- a/src/utils/BamTools/BamWriter.h +++ b/src/utils/BamTools/BamWriter.h @@ -34,9 +34,9 @@ class BamWriter { // closes the alignment archive void Close(void); // opens the alignment archive - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, + bool Open(const std::string& filename, + const std::string& samHeader, + const BamTools::RefVector& referenceSequences, bool writeUncompressed = false); // saves the alignment to the alignment archive void SaveAlignment(const BamTools::BamAlignment& al); diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 8b11f46bea5276f44c32bd88b72f84a0cbb10cef..9f243dc3f1d0169321a9b2f36b2de8724c03aeaf 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -17,43 +17,43 @@ Helper functions *************************************************/ void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { - if (bed.otherFields.size() < 6) { - cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl; - exit(1); - } - - int blockCount = atoi(bed.otherFields[3].c_str()); - if ( blockCount <= 0 ) { - cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl; - exit(1); - } - else if ( blockCount == 1 ) { - //take a short-cut for single blocks - bedBlocks.push_back(bed); - } - else { - // get the comma-delimited strings for the BED12 block starts and block ends. - string blockSizes(bed.otherFields[4]); - string blockStarts(bed.otherFields[5]); - - vector<int> sizes; - vector<int> starts; - Tokenize(blockSizes, sizes, ","); - Tokenize(blockStarts, starts, ","); - - if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { - cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl; - exit(1); - } + if (bed.otherFields.size() < 6) { + cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl; + exit(1); + } + + int blockCount = atoi(bed.otherFields[3].c_str()); + if ( blockCount <= 0 ) { + cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl; + exit(1); + } + else if ( blockCount == 1 ) { + //take a short-cut for single blocks + bedBlocks.push_back(bed); + } + else { + // get the comma-delimited strings for the BED12 block starts and block ends. + string blockSizes(bed.otherFields[4]); + string blockStarts(bed.otherFields[5]); + + vector<int> sizes; + vector<int> starts; + Tokenize(blockSizes, sizes, ","); + Tokenize(blockStarts, starts, ","); + + if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { + cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl; + exit(1); + } // add each BED block to the bedBlocks vector - for (UINT i = 0; i < (UINT) blockCount; ++i) { + for (UINT i = 0; i < (UINT) blockCount; ++i) { CHRPOS blockStart = bed.start + starts[i]; CHRPOS blockEnd = bed.start + starts[i] + sizes[i]; BED currBedBlock(bed.chrom, blockStart, blockEnd, bed.name, bed.score, bed.strand, bed.otherFields); bedBlocks.push_back(currBedBlock); - } - } + } + } } @@ -61,52 +61,52 @@ void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { Sorting comparison functions ************************************************/ bool sortByChrom(BED const &a, BED const &b) { - if (a.chrom < b.chrom) return true; - else return false; + if (a.chrom < b.chrom) return true; + else return false; }; bool sortByStart(const BED &a, const BED &b) { - if (a.start < b.start) return true; - else return false; + if (a.start < b.start) return true; + else return false; }; bool sortBySizeAsc(const BED &a, const BED &b) { - - CHRPOS aLen = a.end - a.start; - CHRPOS bLen = b.end - b.start; - - if (aLen < bLen) return true; - else return false; + + CHRPOS aLen = a.end - a.start; + CHRPOS bLen = b.end - b.start; + + if (aLen < bLen) return true; + else return false; }; bool sortBySizeDesc(const BED &a, const BED &b) { - - CHRPOS aLen = a.end - a.start; - CHRPOS bLen = b.end - b.start; - - if (aLen > bLen) return true; - else return false; + + CHRPOS aLen = a.end - a.start; + CHRPOS bLen = b.end - b.start; + + if (aLen > bLen) return true; + else return false; }; bool sortByScoreAsc(const BED &a, const BED &b) { - if (a.score < b.score) return true; - else return false; + if (a.score < b.score) return true; + else return false; }; bool sortByScoreDesc(const BED &a, const BED &b) { - if (a.score > b.score) return true; - else return false; + if (a.score > b.score) return true; + else return false; }; bool byChromThenStart(BED const &a, BED const &b) { - if (a.chrom < b.chrom) return true; - else if (a.chrom > b.chrom) return false; + if (a.chrom < b.chrom) return true; + else if (a.chrom > b.chrom) return false; - if (a.start < b.start) return true; - else if (a.start >= b.start) return false; + if (a.start < b.start) return true; + else if (a.start >= b.start) return false; - return false; + return false; }; @@ -126,14 +126,14 @@ BedFile::~BedFile(void) { void BedFile::Open(void) { - if (bedFile == "stdin") { - _bedStream = &cin; - } - // New method thanks to Assaf Gordon - else if ((isGzipFile(bedFile) == false) && (isRegularFile(bedFile) == true)) { + if (bedFile == "stdin") { + _bedStream = &cin; + } + // New method thanks to Assaf Gordon + else if ((isGzipFile(bedFile) == false) && (isRegularFile(bedFile) == true)) { // open an ifstream ifstream beds(bedFile.c_str(), ios::in); - + // can we open the file? if ( !beds ) { cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; @@ -141,23 +141,23 @@ void BedFile::Open(void) { } else { // if so, close it (this was just a test) - beds.close(); + beds.close(); // now set a pointer to the stream so that we _bedStream = new ifstream(bedFile.c_str(), ios::in); } - } - else if ((isGzipFile(bedFile) == true) && (isRegularFile(bedFile) == true)) { - igzstream beds(bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - // if so, close it (this was just a test) - beds.close(); - // now set a pointer to the stream so that we - _bedStream = new igzstream(bedFile.c_str(), ios::in); - } + } + else if ((isGzipFile(bedFile) == true) && (isRegularFile(bedFile) == true)) { + igzstream beds(bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + beds.close(); + // now set a pointer to the stream so that we + _bedStream = new igzstream(bedFile.c_str(), ios::in); + } } else { cerr << "Error: Unexpected file type (" << bedFile << "). Exiting!" << endl; @@ -168,55 +168,55 @@ void BedFile::Open(void) { // Close the BED file void BedFile::Close(void) { - if (bedFile != "stdin") delete _bedStream; + if (bedFile != "stdin") delete _bedStream; } BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) { - // make sure there are still lines to process. - // if so, tokenize, validate and return the BED entry. - if (_bedStream->good()) { - string bedLine; - vector<string> bedFields; - bedFields.reserve(12); - - // parse the bedStream pointer - getline(*_bedStream, bedLine); - lineNum++; - - // split into a string vector. - Tokenize(bedLine,bedFields); - - // load the BED struct as long as it's a valid BED entry. - return parseLine(bed, bedFields, lineNum); - } - - // default if file is closed or EOF - return BED_INVALID; + // make sure there are still lines to process. + // if so, tokenize, validate and return the BED entry. + if (_bedStream->good()) { + string bedLine; + vector<string> bedFields; + bedFields.reserve(12); + + // parse the bedStream pointer + getline(*_bedStream, bedLine); + lineNum++; + + // split into a string vector. + Tokenize(bedLine,bedFields); + + // load the BED struct as long as it's a valid BED entry. + return parseLine(bed, bedFields, lineNum); + } + + // default if file is closed or EOF + return BED_INVALID; } -void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, +void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand) { - BIN startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy - for (BINLEVEL i = 0; i < _binLevels; ++i) { - - // loop through each bin at this level of the hierarchy - BIN offset = _binOffsetsExtended[i]; - for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { - + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to + // with the feature that was passed in. if so, add the feature to // the list of hits. vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); - + for (; bedItr != bedEnd; ++bedItr) { // do we have sufficient overlap? if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { @@ -227,103 +227,103 @@ void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, } } } - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } } -bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, - bool forceStrand, float overlapFraction) { - - BIN startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); - - CHRPOS aLength = (end - start); - - // loop through each bin "level" in the binning hierarchy - for (BINLEVEL i = 0; i < _binLevels; ++i) { - - // loop through each bin at this level of the hierarchy - BIN offset = _binOffsetsExtended[i]; - for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); - vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); - for (; bedItr != bedEnd; ++bedItr) { - - CHRPOS s = max(start, bedItr->start); - CHRPOS e = min(end, bedItr->end); - // the number of overlapping bases b/w a and b - int overlapBases = (e - s); - - // do we have sufficient overlap? - if ( (float) overlapBases / (float) aLength >= overlapFraction) { - // skip the hit if not on the same strand (and we care) - if (forceStrand == false) return true; - else if ( (forceStrand == true) && (strand == bedItr->strand)) { - return true; - } - } - } - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } - return false; +bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool forceStrand, float overlapFraction) { + + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + CHRPOS aLength = (end - start); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); + vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); + // the number of overlapping bases b/w a and b + int overlapBases = (e - s); + + // do we have sufficient overlap? + if ( (float) overlapBases / (float) aLength >= overlapFraction) { + // skip the hit if not on the same strand (and we care) + if (forceStrand == false) return true; + else if ( (forceStrand == true) && (strand == bedItr->strand)) { + return true; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } + return false; } -bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, - bool forceStrand, float overlapFraction) { - - BIN startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); - - CHRPOS aLength = (end - start); - - // loop through each bin "level" in the binning hierarchy - for (BINLEVEL i = 0; i < _binLevels; ++i) { - - // loop through each bin at this level of the hierarchy - BIN offset = _binOffsetsExtended[i]; - for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); - vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); - for (; bedItr != bedEnd; ++bedItr) { - CHRPOS s = max(start, bedItr->start); - CHRPOS e = min(end, bedItr->end); - - // the number of overlapping bases b/w a and b - int overlapBases = (e - s); - - // do we have sufficient overlap? - if ( (float) overlapBases / (float) aLength >= overlapFraction) { - CHRPOS bLength = (bedItr->end - bedItr->start); - float bOverlap = ( (float) overlapBases / (float) bLength ); - if ((forceStrand == false) && (bOverlap >= overlapFraction)) { - return true; - } - else if ( (forceStrand == true) && (strand == bedItr->strand) && (bOverlap >= overlapFraction)) { - return true; - } - } - } - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } - return false; +bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool forceStrand, float overlapFraction) { + + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + CHRPOS aLength = (end - start); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); + vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); + + // the number of overlapping bases b/w a and b + int overlapBases = (e - s); + + // do we have sufficient overlap? + if ( (float) overlapBases / (float) aLength >= overlapFraction) { + CHRPOS bLength = (bedItr->end - bedItr->start); + float bOverlap = ( (float) overlapBases / (float) bLength ); + if ((forceStrand == false) && (bOverlap >= overlapFraction)) { + return true; + } + else if ( (forceStrand == true) && (strand == bedItr->strand) && (bOverlap >= overlapFraction)) { + return true; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } + return false; } @@ -333,18 +333,18 @@ void BedFile::countHits(const BED &a, bool forceStrand) { startBin = (a.start >> _binFirstShift); endBin = ((a.end-1) >> _binFirstShift); - // loop through each bin "level" in the binning hierarchy + // loop through each bin "level" in the binning hierarchy for (BINLEVEL i = 0; i < _binLevels; ++i) { - // loop through each bin at this level of the hierarchy + // loop through each bin at this level of the hierarchy BIN offset = _binOffsetsExtended[i]; for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to + // with the feature that was passed in. if so, add the feature to // the list of hits. vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin(); - vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end(); + vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { // skip the hit if not on the same strand (and we care) @@ -359,7 +359,7 @@ void BedFile::countHits(const BED &a, bool forceStrand) { if (a.start < bedItr->minOverlapStart) { bedItr->minOverlapStart = a.start; - } + } } } } @@ -373,29 +373,29 @@ void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool forceStrand) { // set to track the distinct B features that had coverage. // we'll update the counts of coverage for these features by one - // at the end of this function to avoid over-counting. + // at the end of this function to avoid over-counting. set< vector<BEDCOV>::iterator > validHits; - + vector<BED>::const_iterator blockItr = bedBlocks.begin(); - vector<BED>::const_iterator blockEnd = bedBlocks.end(); - for (; blockItr != blockEnd; ++blockItr) { - + vector<BED>::const_iterator blockEnd = bedBlocks.end(); + for (; blockItr != blockEnd; ++blockItr) { + BIN startBin, endBin; startBin = (blockItr->start >> _binFirstShift); endBin = ((blockItr->end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy + + // loop through each bin "level" in the binning hierarchy for (BINLEVEL i = 0; i < _binLevels; ++i) { - // loop through each bin at this level of the hierarchy + // loop through each bin at this level of the hierarchy BIN offset = _binOffsetsExtended[i]; for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to + // with the feature that was passed in. if so, add the feature to // the list of hits. vector<BEDCOV>::iterator bedItr = bedCovMap[blockItr->chrom][j].begin(); - vector<BEDCOV>::iterator bedEnd = bedCovMap[blockItr->chrom][j].end(); + vector<BEDCOV>::iterator bedEnd = bedCovMap[blockItr->chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { // skip the hit if not on the same strand (and we care) @@ -407,7 +407,7 @@ void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool forceStrand) { bedItr->depthMap[blockItr->end].ends++; validHits.insert(bedItr); if (blockItr->start < bedItr->minOverlapStart) - bedItr->minOverlapStart = blockItr->start; + bedItr->minOverlapStart = blockItr->start; } } } @@ -433,18 +433,18 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) { startBin = (a.start >> _binFirstShift); endBin = ((a.end-1) >> _binFirstShift); - // loop through each bin "level" in the binning hierarchy + // loop through each bin "level" in the binning hierarchy for (BINLEVEL i = 0; i < _binLevels; ++i) { - // loop through each bin at this level of the hierarchy + // loop through each bin at this level of the hierarchy BIN offset = _binOffsetsExtended[i]; for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to + // with the feature that was passed in. if so, add the feature to // the list of hits. vector<BEDCOVLIST>::iterator bedItr = bedCovListMap[a.chrom][j].begin(); - vector<BEDCOVLIST>::iterator bedEnd = bedCovListMap[a.chrom][j].end(); + vector<BEDCOVLIST>::iterator bedEnd = bedCovListMap[a.chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { if (forceStrand && (a.strand != bedItr->strand)) { @@ -457,7 +457,7 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) { if (a.start < bedItr->minOverlapStarts[index]) { bedItr->minOverlapStarts[index] = a.start; - } + } } } } @@ -468,14 +468,14 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) { void BedFile::setGff (bool gff) { - if (gff == true) this->_isGff = true; - else this->_isGff = false; + if (gff == true) this->_isGff = true; + else this->_isGff = false; } void BedFile::setVcf (bool vcf) { - if (vcf == true) this->_isVcf = true; - else this->_isVcf = false; + if (vcf == true) this->_isVcf = true; + else this->_isVcf = false; } @@ -492,33 +492,33 @@ void BedFile::setBedType (int colNums) { void BedFile::loadBedFileIntoMap() { - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; - - Open(); - while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - BIN bin = getBin(bedEntry.start, bedEntry.end); - bedMap[bedEntry.chrom][bin].push_back(bedEntry); - bedEntry = nullBed; - } - } - Close(); + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); + bedMap[bedEntry.chrom][bin].push_back(bedEntry); + bedEntry = nullBed; + } + } + Close(); } void BedFile::loadBedCovFileIntoMap() { - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; - - Open(); - while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - BIN bin = getBin(bedEntry.start, bedEntry.end); - + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); + BEDCOV bedCov; bedCov.chrom = bedEntry.chrom; bedCov.start = bedEntry.start; @@ -527,27 +527,27 @@ void BedFile::loadBedCovFileIntoMap() { bedCov.score = bedEntry.score; bedCov.strand = bedEntry.strand; bedCov.otherFields = bedEntry.otherFields; - bedCov.count = 0; - bedCov.minOverlapStart = INT_MAX; - - bedCovMap[bedEntry.chrom][bin].push_back(bedCov); - bedEntry = nullBed; - } - } - Close(); + bedCov.count = 0; + bedCov.minOverlapStart = INT_MAX; + + bedCovMap[bedEntry.chrom][bin].push_back(bedCov); + bedEntry = nullBed; + } + } + Close(); } void BedFile::loadBedCovListFileIntoMap() { - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); - Open(); - while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - BIN bin = getBin(bedEntry.start, bedEntry.end); - BEDCOVLIST bedCovList; bedCovList.chrom = bedEntry.chrom; bedCovList.start = bedEntry.start; @@ -557,32 +557,32 @@ void BedFile::loadBedCovListFileIntoMap() { bedCovList.strand = bedEntry.strand; bedCovList.otherFields = bedEntry.otherFields; - bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList); - bedEntry = nullBed; - } - } - Close(); + bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList); + bedEntry = nullBed; + } + } + Close(); } void BedFile::loadBedFileIntoMapNoBin() { - - BED bedEntry, nullBed; - int lineNum = 0; - BedLineStatus bedStatus; - - Open(); - while ((bedStatus = this->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - bedMapNoBin[bedEntry.chrom].push_back(bedEntry); - bedEntry = nullBed; - } - } - Close(); - - // sort the BED entries for each chromosome - // in ascending order of start position - for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin(); m != this->bedMapNoBin.end(); ++m) { - sort(m->second.begin(), m->second.end(), sortByStart); - } + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = this->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + bedMapNoBin[bedEntry.chrom].push_back(bedEntry); + bedEntry = nullBed; + } + } + Close(); + + // sort the BED entries for each chromosome + // in ascending order of start position + for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin(); m != this->bedMapNoBin.end(); ++m) { + sort(m->second.begin(), m->second.end(), sortByStart); + } } diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 2d5d8886eb7946ed0d9f60135fe928ddca1819ad..ddf29d3d457ff5585a7955588770ab02f39d6338 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -50,7 +50,7 @@ typedef uint32_t UINT; const BIN _numBins = 37450; const BINLEVEL _binLevels = 7; -// bins range in size from 16kb to 512Mb +// bins range in size from 16kb to 512Mb // Bin 0 spans 512Mbp, # Level 1 // Bins 1-8 span 64Mbp, # Level 2 // Bins 9-72 span 8Mbp, # Level 3 @@ -59,9 +59,9 @@ const BINLEVEL _binLevels = 7; // Bins 4681-37449 span 16Kbp # Level 6 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; //const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; - -const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ -const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ + +const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ +const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ //************************************************* @@ -69,39 +69,39 @@ const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin //************************************************* struct DEPTH { - UINT starts; - UINT ends; + UINT starts; + UINT ends; }; /* - Structure for regular BED records + Structure for regular BED records */ struct BED { - // Regular BED fields - string chrom; - CHRPOS start; - CHRPOS end; - string name; - string score; - string strand; - - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // experimental fields for the FJOIN approach. - bool added; - bool finished; - // list of hits from another file. + // Regular BED fields + string chrom; + CHRPOS start; + CHRPOS end; + string name; + string score; + string strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; + + // experimental fields for the FJOIN approach. + bool added; + bool finished; + // list of hits from another file. vector<BED> overlaps; - + public: // constructors // Null - BED() - : chrom(""), + BED() + : chrom(""), start(0), end(0), name(""), @@ -109,40 +109,40 @@ public: strand(""), otherFields(), added(false), - finished(false), + finished(false), overlaps() {} - + // BED3 - BED(string chrom, CHRPOS start, CHRPOS end) - : chrom(chrom), + BED(string chrom, CHRPOS start, CHRPOS end) + : chrom(chrom), start(start), end(end) {} // BED4 - BED(string chrom, CHRPOS start, CHRPOS end, string strand) - : chrom(chrom), + BED(string chrom, CHRPOS start, CHRPOS end, string strand) + : chrom(chrom), start(start), end(end), strand(strand) {} // BED6 - BED(string chrom, CHRPOS start, CHRPOS end, string name, - string score, string strand) - : chrom(chrom), + BED(string chrom, CHRPOS start, CHRPOS end, string name, + string score, string strand) + : chrom(chrom), start(start), end(end), name(name), score(score), strand(strand) {} - + // BEDALL - BED(string chrom, CHRPOS start, CHRPOS end, string name, - string score, string strand, vector<string> otherFields) - : chrom(chrom), + BED(string chrom, CHRPOS start, CHRPOS end, string name, + string score, string strand, vector<string> otherFields) + : chrom(chrom), start(start), end(end), name(name), @@ -150,13 +150,13 @@ public: strand(strand), otherFields(otherFields) {} - + }; // BED /* - Structure for each end of a paired BED record - mate points to the other end. + Structure for each end of a paired BED record + mate points to the other end. */ struct MATE { BED bed; @@ -166,21 +166,21 @@ struct MATE { /* - Structure for regular BED COVERAGE records + Structure for regular BED COVERAGE records */ struct BEDCOV { - // Regular BED fields - CHRPOS start; - CHRPOS end; - - string chrom; - string name; - string score; - string strand; + // Regular BED fields + CHRPOS start; + CHRPOS end; + + string chrom; + string name; + string score; + string strand; - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; // Additional fields specific to computing coverage map<unsigned int, DEPTH> depthMap; @@ -190,22 +190,22 @@ struct BEDCOV { /* - Structure for BED COVERAGE records having lists of - multiple coverages + Structure for BED COVERAGE records having lists of + multiple coverages */ struct BEDCOVLIST { - // Regular BED fields - CHRPOS start; - CHRPOS end; - - string chrom; - string name; - string score; - string strand; + // Regular BED fields + CHRPOS start; + CHRPOS end; - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; + string chrom; + string name; + string score; + string strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; // Additional fields specific to computing coverage vector< map<unsigned int, DEPTH> > depthMapList; @@ -216,7 +216,7 @@ struct BEDCOVLIST { // enum to flag the state of a given line in a BED file. enum BedLineStatus -{ +{ BED_INVALID = -1, BED_HEADER = 0, BED_BLANK = 1, @@ -225,7 +225,7 @@ enum BedLineStatus // enum to indicate the type of file we are dealing with enum FileType -{ +{ BED_FILETYPE, GFF_FILETYPE, VCF_FILETYPE @@ -254,10 +254,10 @@ typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; // EXPERIMENTAL - wait for TR1 // typedef vector<BED> bedVector; // typedef vector<BEDCOV> bedCovVector; -// +// // typedef tr1::unordered_map<BIN, bedVector> binsToBeds; // typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs; -// +// // typedef tr1::unordered_map<string, binsToBeds> masterBedMap; // typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap; // typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin; @@ -270,7 +270,7 @@ BIN getBin(CHRPOS start, CHRPOS end) { --end; start >>= _binFirstShift; end >>= _binFirstShift; - + for (register short i = 0; i < _binLevels; ++i) { if (start == end) return _binOffsetsExtended[i] + start; start >>= _binNextShift; @@ -287,16 +287,16 @@ inline bool isInteger(const std::string& s) { int len = s.length(); for (int i = 0; i < len; i++) { if (!std::isdigit(s[i])) return false; - } + } return true; } -// return the amount of overlap between two features. Negative if none and the the +// return the amount of overlap between two features. Negative if none and the the // number of negative bases is the distance between the two. -inline +inline int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { - return min(aE, bE) - max(aS, bS); + return min(aE, bE) - max(aS, bS); } @@ -304,8 +304,8 @@ int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); -// BED Sorting Methods -bool sortByChrom(const BED &a, const BED &b); +// BED Sorting Methods +bool sortByChrom(const BED &a, const BED &b); bool sortByStart(const BED &a, const BED &b); bool sortBySizeAsc(const BED &a, const BED &b); bool sortBySizeDesc(const BED &a, const BED &b); @@ -322,89 +322,89 @@ class BedFile { public: - // Constructor - BedFile(string &); - - // Destructor - ~BedFile(void); - - // Open a BED file for reading (creates an istream pointer) - void Open(void); - - // Close an opened BED file. - void Close(void); - - // Get the next BED entry in an opened BED file. - BedLineStatus GetNextBed (BED &bed, int &lineNum); - - // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs - void loadBedFileIntoMap(); - - // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs - void loadBedCovFileIntoMap(); - - // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs - void loadBedCovListFileIntoMap(); - - // load a BED file into a map keyed by chrom. value is vector of BEDs - void loadBedFileIntoMapNoBin(); - - // Given a chrom, start, end and strand for a single feature, - // search for all overlapping features in another BED file. - // Searches through each relevant genome bin on the same chromosome - // as the single feature. Note: Adapted from kent source "binKeeperFind" - void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand); - - // return true if at least one overlap was found. otherwise, return false. - bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, - bool forceStrand, float overlapFraction = 0.0); - - // return true if at least one __reciprocal__ overlap was found. otherwise, return false. - bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, - bool forceStrand, float overlapFraction = 0.0); - - // Given a chrom, start, end and strand for a single feature, - // increment a the number of hits for each feature in B file - // that the feature overlaps - void countHits(const BED &a, bool forceStrand); - - // same as above, but has special logic that processes a set of - // BED "blocks" from a single entry so as to avoid over-counting - // each "block" of a single BAM/BED12 as distinct coverage. That is, - // if one read has four block, we only want to count the coverage as - // coming from one read, not four. + // Constructor + BedFile(string &); + + // Destructor + ~BedFile(void); + + // Open a BED file for reading (creates an istream pointer) + void Open(void); + + // Close an opened BED file. + void Close(void); + + // Get the next BED entry in an opened BED file. + BedLineStatus GetNextBed (BED &bed, int &lineNum); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs + void loadBedFileIntoMap(); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs + void loadBedCovFileIntoMap(); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs + void loadBedCovListFileIntoMap(); + + // load a BED file into a map keyed by chrom. value is vector of BEDs + void loadBedFileIntoMapNoBin(); + + // Given a chrom, start, end and strand for a single feature, + // search for all overlapping features in another BED file. + // Searches through each relevant genome bin on the same chromosome + // as the single feature. Note: Adapted from kent source "binKeeperFind" + void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand); + + // return true if at least one overlap was found. otherwise, return false. + bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool forceStrand, float overlapFraction = 0.0); + + // return true if at least one __reciprocal__ overlap was found. otherwise, return false. + bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool forceStrand, float overlapFraction = 0.0); + + // Given a chrom, start, end and strand for a single feature, + // increment a the number of hits for each feature in B file + // that the feature overlaps + void countHits(const BED &a, bool forceStrand); + + // same as above, but has special logic that processes a set of + // BED "blocks" from a single entry so as to avoid over-counting + // each "block" of a single BAM/BED12 as distinct coverage. That is, + // if one read has four block, we only want to count the coverage as + // coming from one read, not four. void countSplitHits(const vector<BED> &bedBlock, bool forceStrand); - // Given a chrom, start, end and strand for a single feature, - // increment a the number of hits for each feature in B file - // that the feature overlaps + // Given a chrom, start, end and strand for a single feature, + // increment a the number of hits for each feature in B file + // that the feature overlaps void countListHits(const BED &a, int index, bool forceStrand); - - // the bedfile with which this instance is associated - string bedFile; - unsigned int bedType; // 3-6, 12 for BED - // 9 for GFF - - // Main data structires used by BEDTools + + // the bedfile with which this instance is associated + string bedFile; + unsigned int bedType; // 3-6, 12 for BED + // 9 for GFF + + // Main data structires used by BEDTools masterBedCovMap bedCovMap; masterBedCovListMap bedCovListMap; - masterBedMap bedMap; - masterBedMapNoBin bedMapNoBin; - + masterBedMap bedMap; + masterBedMapNoBin bedMapNoBin; + private: - - // data - bool _isGff; - bool _isVcf; + + // data + bool _isGff; + bool _isVcf; bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF) - FileType _fileType; // what is the file type? (BED? GFF? VCF?) - istream *_bedStream; + FileType _fileType; // what is the file type? (BED? GFF? VCF?) + istream *_bedStream; void setGff (bool isGff); void setVcf (bool isVcf); void setFileType (FileType type); void setBedType (int colNums); - + /****************************************************** Private definitions to circumvent linker issues with templated member functions. @@ -416,19 +416,19 @@ private: template <typename T> inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { - //char *p2End, *p3End, *p4End, *p5End; - //long l2, l3, l4, l5; + //char *p2End, *p3End, *p4End, *p5End; + //long l2, l3, l4, l5; unsigned int numFields = lineVector.size(); - - // bail out if we have a blank line - if (numFields == 0) { - return BED_BLANK; - } - if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { + // bail out if we have a blank line + if (numFields == 0) { + return BED_BLANK; + } - if (numFields >= 3) { - // line parsing for all lines after the first non-header line + if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { + + if (numFields >= 3) { + // line parsing for all lines after the first non-header line if (_typeIsKnown == true) { switch(_fileType) { case BED_FILETYPE: @@ -442,48 +442,48 @@ private: exit(1); } } - // line parsing for first non-header line: figure out file contents + // line parsing for first non-header line: figure out file contents else { - // it's BED format if columns 2 and 3 are integers - if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { - setGff(false); - setFileType(BED_FILETYPE); + // it's BED format if columns 2 and 3 are integers + if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { + setGff(false); + setFileType(BED_FILETYPE); setBedType(numFields); // we now expect numFields columns in each line - if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - // it's VCF, assuming the second column is numeric and there are at least 8 fields. - else if (isInteger(lineVector[1]) && numFields >= 8) { + if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + } + // it's VCF, assuming the second column is numeric and there are at least 8 fields. + else if (isInteger(lineVector[1]) && numFields >= 8) { setGff(false); setVcf(true); setFileType(VCF_FILETYPE); setBedType(numFields); // we now expect numFields columns in each line if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. - else if ((numFields == 9) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { - setGff(true); - setFileType(GFF_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - else { - cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << - "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; - exit(1); - } - } - } - else { - cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; - exit(1); - } - } - else { - lineNum--; - return BED_HEADER; - } - // default - return BED_INVALID; + } + // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. + else if ((numFields == 9) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { + setGff(true); + setFileType(GFF_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + } + else { + cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << + "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; + exit(1); + } + } + } + else { + cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; + exit(1); + } + } + else { + lineNum--; + return BED_HEADER; + } + // default + return BED_INVALID; } @@ -493,13 +493,13 @@ private: template <typename T> inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { - // process as long as the number of fields in this + // process as long as the number of fields in this // line matches what we expect for this file. - if (numFields == this->bedType) { + if (numFields == this->bedType) { bed.chrom = lineVector[0]; bed.start = atoi(lineVector[1].c_str()); bed.end = atoi(lineVector[2].c_str()); - + if (this->bedType == 4) { bed.name = lineVector[3]; } @@ -515,23 +515,23 @@ private: else if (this->bedType > 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5]; + bed.strand = lineVector[5]; for (unsigned int i = 6; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); + bed.otherFields.push_back(lineVector[i]); } } else if (this->bedType != 3) { - cerr << "Error: unexpected number of fields at line: " << lineNum - << ". Verify that your files are TAB-delimited. Exiting..." << endl; + cerr << "Error: unexpected number of fields at line: " << lineNum + << ". Verify that your files are TAB-delimited. Exiting..." << endl; exit(1); } - + // sanity checks. if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) { return true; } else if (bed.start > bed.end) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; exit(1); } else if ( (bed.start < 0) || (bed.end < 0) ) { @@ -539,19 +539,19 @@ private: exit(1); } } - else if (numFields == 1) { - cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((numFields < 3) && (numFields != 0)) { - cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; + else if (numFields == 1) { + cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((numFields != this->bedType) && (numFields != 0)) { + cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((numFields < 3) && (numFields != 0)) { + cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; } @@ -560,48 +560,48 @@ private: */ template <typename T> inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { - if (numFields == this->bedType) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based + if (numFields == this->bedType) { + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele. bed.strand = "+"; - // construct the name from the ref and alt alleles. + // construct the name from the ref and alt alleles. // if it's an annotated variant, add the rsId as well. bed.name = lineVector[3] + "/" + lineVector[4]; if (lineVector[2] != ".") { bed.name += "_" + lineVector[2]; } - if (this->bedType > 2) { - for (unsigned int i = 2; i < numFields; ++i) - bed.otherFields.push_back(lineVector[i]); - } + if (this->bedType > 2) { + for (unsigned int i = 2; i < numFields; ++i) + bed.otherFields.push_back(lineVector[i]); + } - if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { return true; - } - else if (bed.start > bed.end) { - cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; - exit(1); - } - } - else if (numFields == 1) { - cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((numFields < 2) && (numFields != 0)) { - cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if (numFields == 1) { + cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((numFields != this->bedType) && (numFields != 0)) { + cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; exit(1); - } - return false; + } + else if ((numFields < 2) && (numFields != 0)) { + cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; } @@ -611,348 +611,348 @@ private: */ template <typename T> inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { - if (numFields == this->bedType) { - if (this->bedType == 9 && _isGff) { - bed.chrom = lineVector[0]; - // substract 1 to force the start to be BED-style - bed.start = atoi(lineVector[3].c_str()) - 1; - bed.end = atoi(lineVector[4].c_str()); - bed.name = lineVector[2]; - bed.score = lineVector[5]; - bed.strand = lineVector[6].c_str(); - bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED - bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED - bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED - } - else { - cerr << "Error: unexpected number of fields at line: " << lineNum << - ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; - exit(1); - } - if (bed.start > bed.end) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; - exit(1); - } - else return true; - } - else if (numFields == 1) { - cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((numFields < 9) && (numFields != 0)) { - cerr << "TAB delimited GFF file with 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; + if (numFields == this->bedType) { + if (this->bedType == 9 && _isGff) { + bed.chrom = lineVector[0]; + // substract 1 to force the start to be BED-style + bed.start = atoi(lineVector[3].c_str()) - 1; + bed.end = atoi(lineVector[4].c_str()); + bed.name = lineVector[2]; + bed.score = lineVector[5]; + bed.strand = lineVector[6].c_str(); + bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED + bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED + bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED + } + else { + cerr << "Error: unexpected number of fields at line: " << lineNum << + ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; + exit(1); + } + if (bed.start > bed.end) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; + exit(1); + } + else return true; + } + else if (numFields == 1) { + cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((numFields != this->bedType) && (numFields != 0)) { + cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((numFields < 9) && (numFields != 0)) { + cerr << "TAB delimited GFF file with 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; } - - + + public: - + /* - reportBedTab + reportBedTab - Writes the _original_ BED entry with a TAB - at the end of the line. - Works for BED3 - BED6. + Writes the _original_ BED entry with a TAB + at the end of the line. + Works for BED3 - BED6. */ template <typename T> inline void reportBedTab(const T &bed) { // BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\t", bed.chrom.c_str(), bed.start, bed.end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - // GFF - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\t", bed.chrom.c_str(), bed.start, bed.end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + // GFF + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), bed.start+1, bed.end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } } /* - reportBedNewLine + reportBedNewLine - Writes the _original_ BED entry with a NEWLINE - at the end of the line. - Works for BED3 - BED6. + Writes the _original_ BED entry with a NEWLINE + at the end of the line. + Works for BED3 - BED6. */ template <typename T> inline void reportBedNewLine(const T &bed) { //BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\n", bed.chrom.c_str(), bed.start, bed.end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\n"); - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - printf("\n"); - } - //GFF - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\n", bed.chrom.c_str(), bed.start, bed.end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("\t%s", othIt->c_str()); + } + printf("\n"); + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + printf("\n"); + } + //GFF + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), bed.start+1, bed.end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } } /* - reportBedRangeNewLine + reportBedRangeNewLine - Writes a custom start->end for a BED entry - with a NEWLINE at the end of the line. + Writes a custom start->end for a BED entry + with a NEWLINE at the end of the line. - Works for BED3 - BED6. + Works for BED3 - BED6. */ template <typename T> inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { // BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - // GFF - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + // GFF + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } } /* - reportBedRangeTab + reportBedRangeTab - Writes a custom start->end for a BED entry - with a TAB at the end of the line. + Writes a custom start->end for a BED entry + with a TAB at the end of the line. - Works for BED3 - BED6. + Works for BED3 - BED6. */ template <typename T> inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) { // BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\n"); - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("\t%s", othIt->c_str()); + } + printf("\n"); + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } printf("\n"); - } - // GFF - else if (this->bedType == 9) { // add 1 to the start for GFF - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } + } + // GFF + else if (this->bedType == 9) { // add 1 to the start for GFF + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } } /* - reportNullBedTab + reportNullBedTab */ void reportNullBedTab() { - if (_isGff == false) { - if (this->bedType == 3) { - printf (".\t-1\t-1\t"); - } - else if (this->bedType == 4) { - printf (".\t-1\t-1\t.\t"); - } - else if (this->bedType == 5) { - printf (".\t-1\t-1\t.\t-1\t"); - } - else if (this->bedType == 6) { - printf (".\t-1\t-1\t.\t-1\t.\t"); - } - else if (this->bedType > 6) { - printf (".\t-1\t-1\t.\t-1\t.\t"); - for (unsigned int i = 6; i < this->bedType; ++i) { - printf(".\t"); - } - } - } - else if (this->bedType == 9) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); - } + if (_isGff == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\t"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\t"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\t"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf(".\t"); + } + } + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); + } } /* - reportNullBedTab + reportNullBedTab */ void reportNullBedNewLine() { - if (_isGff == false) { - if (this->bedType == 3) { - printf (".\t-1\t-1\n"); - } - else if (this->bedType == 4) { - printf (".\t-1\t-1\t.\n"); - } - else if (this->bedType == 5) { - printf (".\t-1\t-1\t.\t-1\n"); - } - else if (this->bedType == 6) { - printf (".\t-1\t-1\t.\t-1\t.\n"); - } - else if (this->bedType > 6) { - printf (".\t-1\t-1\t.\t-1\t."); - for (unsigned int i = 6; i < this->bedType; ++i) { - printf("\t."); - } - printf("\n"); - } - } - else if (this->bedType == 9) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); - } + if (_isGff == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\n"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\n"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\n"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\n"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t."); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf("\t."); + } + printf("\n"); + } + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); + } } - + }; #endif /* BEDFILE_H */ diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index 98d3a200c9bf865086f199393765f420be3fca70..f9cd5d2220aad0e9f208aaa582461ad985895e67 100644 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -1,7 +1,7 @@ -// +// // bedFilePE.cpp // BEDTools -// +// // Created by Aaron Quinlan Spring 2009. // Copyright 2009 Aaron Quinlan. All rights reserved. // @@ -9,7 +9,7 @@ // // Acknowledgments: Much of the code herein is taken from Jim Kent's // BED processing code. I am grateful for his elegant -// genome binning algorithm and therefore use it extensively. +// genome binning algorithm and therefore use it extensively. #include "bedFilePE.h" @@ -17,7 +17,7 @@ // Constructor BedFilePE::BedFilePE(string &bedFile) { - this->bedFile = bedFile; + this->bedFile = bedFile; } // Destructor @@ -25,512 +25,528 @@ BedFilePE::~BedFilePE(void) { } void BedFilePE::Open(void) { - if (bedFile == "stdin") { - _bedStream = &cin; - } - else { - size_t foundPos; - foundPos = bedFile.find_last_of(".gz"); - // is this a GZIPPED BED file? - if (foundPos == bedFile.size() - 1) { - igzstream beds(bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bedpe file (" << bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - // if so, close it (this was just a test) - beds.close(); - // now set a pointer to the stream so that we - // can read the file later on. - // Thank God for Josuttis, p. 631! - _bedStream = new igzstream(bedFile.c_str(), ios::in); - } - } - // not GZIPPED. - else { - - ifstream beds(bedFile.c_str(), ios::in); - // can we open the file? - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - // if so, close it (this was just a test) - beds.close(); - // now set a pointer to the stream so that we - // can read the file later on. - // Thank God for Josuttis, p. 631! - _bedStream = new ifstream(bedFile.c_str(), ios::in); - } - } - } + if (bedFile == "stdin") { + _bedStream = &cin; + } + else { + size_t foundPos; + foundPos = bedFile.find_last_of(".gz"); + // is this a GZIPPED BED file? + if (foundPos == bedFile.size() - 1) { + igzstream beds(bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bedpe file (" << bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + beds.close(); + // now set a pointer to the stream so that we + // can read the file later on. + // Thank God for Josuttis, p. 631! + _bedStream = new igzstream(bedFile.c_str(), ios::in); + } + } + // not GZIPPED. + else { + + ifstream beds(bedFile.c_str(), ios::in); + // can we open the file? + if ( !beds ) { + cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + beds.close(); + // now set a pointer to the stream so that we + // can read the file later on. + // Thank God for Josuttis, p. 631! + _bedStream = new ifstream(bedFile.c_str(), ios::in); + } + } + } } // Close the BEDPE file void BedFilePE::Close(void) { - if (bedFile != "stdin") delete _bedStream; + if (bedFile != "stdin") delete _bedStream; } BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) { - // make sure there are still lines to process. - // if so, tokenize, validate and return the BEDPE entry. - if (_bedStream->good()) { - string bedPELine; - vector<string> bedPEFields; - bedPEFields.reserve(10); - - // parse the bedStream pointer - getline(*_bedStream, bedPELine); - lineNum++; - - // split into a string vector. - Tokenize(bedPELine,bedPEFields); - - // load the BEDPE struct as long as it's a valid BEDPE entry. - return parseLine(bedpe, bedPEFields, lineNum); - } - // default if file is closed or EOF - return BED_INVALID; + // make sure there are still lines to process. + // if so, tokenize, validate and return the BEDPE entry. + if (_bedStream->good()) { + string bedPELine; + vector<string> bedPEFields; + bedPEFields.reserve(10); + + // parse the bedStream pointer + getline(*_bedStream, bedPELine); + lineNum++; + + // split into a string vector. + Tokenize(bedPELine,bedPEFields); + + // load the BEDPE struct as long as it's a valid BEDPE entry. + return parseLine(bedpe, bedPEFields, lineNum); + } + // default if file is closed or EOF + return BED_INVALID; } /* - reportBedPETab - - Writes the _original_ BED entry for A. - Works for BEDPE only. + reportBedPETab + + Writes the _original_ BED entry for A. + Works for BEDPE only. */ void BedFilePE::reportBedPETab(const BEDPE &a) { - if (this->bedType == 6) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2); - } - else if (this->bedType == 7) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str()); - } - else if (this->bedType == 8) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str()); - } - else if (this->bedType == 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - } - else if (this->bedType > 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - - vector<string>::const_iterator othIt = a.otherFields.begin(); - vector<string>::const_iterator othEnd = a.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\t"); - } + if (this->bedType == 6) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2); + } + else if (this->bedType == 7) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str()); + } + else if (this->bedType == 8) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str(), a.score.c_str()); + } + else if (this->bedType == 10) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); + } + else if (this->bedType > 10) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); + + vector<string>::const_iterator othIt = a.otherFields.begin(); + vector<string>::const_iterator othEnd = a.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("\t%s", othIt->c_str()); + } + printf("\t"); + } } /* - reportBedPENewLine - - Writes the _original_ BED entry for A. - Works for BEDPE only. + reportBedPENewLine + + Writes the _original_ BED entry for A. + Works for BEDPE only. */ void BedFilePE::reportBedPENewLine(const BEDPE &a) { - if (this->bedType == 6) { - printf("%s\t%d\t%d\t%s\t%d\t%d\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2); - } - else if (this->bedType == 7) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str()); - } - else if (this->bedType == 8) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str()); - } - else if (this->bedType == 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - } - else if (this->bedType > 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - - vector<string>::const_iterator othIt = a.otherFields.begin(); - vector<string>::const_iterator othEnd = a.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\n"); - } + if (this->bedType == 6) { + printf("%s\t%d\t%d\t%s\t%d\t%d\n", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2); + } + else if (this->bedType == 7) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\n", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str()); + } + else if (this->bedType == 8) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str(), a.score.c_str()); + } + else if (this->bedType == 10) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); + } + else if (this->bedType > 10) { + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, + a.chrom2.c_str(), a.start2, a.end2, + a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); + + vector<string>::const_iterator othIt = a.otherFields.begin(); + vector<string>::const_iterator othEnd = a.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("\t%s", othIt->c_str()); + } + printf("\n"); + } } BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) { - // bail out if we have a blank line - if (lineVector.empty()) - return BED_BLANK; - - if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { - // we need at least 6 columns - if (lineVector.size() >= 6) { - if (parseBedPELine(bedpe, lineVector, lineNum) == true) - return BED_VALID; - else return BED_INVALID; - } - else { - cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl; - exit(1); - } - } - else { - lineNum--; - return BED_HEADER; - } - - // default - return BED_INVALID; + // bail out if we have a blank line + if (lineVector.empty()) + return BED_BLANK; + + if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { + // we need at least 6 columns + if (lineVector.size() >= 6) { + if (parseBedPELine(bedpe, lineVector, lineNum) == true) + return BED_VALID; + else return BED_INVALID; + } + else { + cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl; + exit(1); + } + } + else { + lineNum--; + return BED_HEADER; + } + + // default + return BED_INVALID; } bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) { - if ((lineNum == 1) && (lineVector.size() >= 6)) { - - this->bedType = lineVector.size(); - - if (this->bedType == 6) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - return true; - } - else if (this->bedType == 7) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - return true; - } - else if (this->bedType == 8) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - return true; - } - else if (this->bedType == 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - return true; - } - else if (this->bedType > 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - for (unsigned int i = 10; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - return true; - } - else { - cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start1 > bed.end1) { - cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; - return false; - } - else if (bed.start2 > bed.end2) { - cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; - return false; - } - else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { - cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; - return false; - } - } - else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { - - if (this->bedType == 6) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - return true; - } - else if (this->bedType == 7) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - return true; - } - else if (this->bedType == 8) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - return true; - } - else if (this->bedType == 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - return true; - } - else if (this->bedType > 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - for (unsigned int i = 10; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - return true; - } - else { - cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start1 > bed.end1) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; - return false; - } - else if (bed.start2 > bed.end2) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; - return false; - } - else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; - return false; - } - } - else if (lineVector.size() == 1) { - cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { - cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() < 6) && (lineVector.size() != 0)) { - cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; + if ((lineNum == 1) && (lineVector.size() >= 6)) { + + this->bedType = lineVector.size(); + + if (this->bedType == 6) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + return true; + } + else if (this->bedType == 7) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + return true; + } + else if (this->bedType == 8) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = lineVector[7].c_str(); + return true; + } + else if (this->bedType == 10) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = lineVector[7].c_str(); + + bed.strand1 = lineVector[8]; + bed.strand2 = lineVector[9]; + + return true; + } + else if (this->bedType > 10) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = lineVector[7].c_str(); + + bed.strand1 = lineVector[8]; + bed.strand2 = lineVector[9]; + + for (unsigned int i = 10; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + return true; + } + else { + cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start1 > bed.end1) { + cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; + return false; + } + else if (bed.start2 > bed.end2) { + cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; + return false; + } + else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { + cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; + return false; + } + } + else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + + if (this->bedType == 6) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + return true; + } + else if (this->bedType == 7) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + return true; + } + else if (this->bedType == 8) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = lineVector[7].c_str(); + return true; + } + else if (this->bedType == 10) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = lineVector[7].c_str(); + + bed.strand1 = lineVector[8]; + bed.strand2 = lineVector[9]; + + return true; + } + else if (this->bedType > 10) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = lineVector[7].c_str(); + + bed.strand1 = lineVector[8]; + bed.strand2 = lineVector[9]; + + for (unsigned int i = 10; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + return true; + } + else { + cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start1 > bed.end1) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; + return false; + } + else if (bed.start2 > bed.end2) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; + return false; + } + else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; + return false; + } + } + else if (lineVector.size() == 1) { + cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { + cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() < 6) && (lineVector.size() != 0)) { + cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; } /* - Adapted from kent source "binKeeperFind" + Adapted from kent source "binKeeperFind" */ -void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<MATE> &hits, bool forceStrand) { - - int startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { - - // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<MATE>::const_iterator bedItr; - vector<MATE>::const_iterator bedEnd; - if (bEnd == 1) { - bedItr = bedMapEnd1[chrom][j].begin(); - bedEnd = bedMapEnd1[chrom][j].end(); - } - else if (bEnd == 2) { - bedItr = bedMapEnd2[chrom][j].begin(); - bedEnd = bedMapEnd2[chrom][j].end(); - } - else { - cerr << "Unexpected end of B requested" << endl; - } - for (; bedItr != bedEnd; ++bedItr) { - - // skip the hit if not on the same strand (and we care) - if (forceStrand && (strand != bedItr->bed.strand)) { - continue; - } - else if (overlaps(bedItr->bed.start, bedItr->bed.end, start, end) > 0) { - hits.push_back(*bedItr); // it's a hit, add it. - } - - } - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } +void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand, + vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames) { + + int startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (int i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + int offset = _binOffsetsExtended[i]; + for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<MATE>::const_iterator bedItr; + vector<MATE>::const_iterator bedEnd; + if (bEnd == 1) { + bedItr = bedMapEnd1[chrom][j].begin(); + bedEnd = bedMapEnd1[chrom][j].end(); + } + else if (bEnd == 2) { + bedItr = bedMapEnd2[chrom][j].begin(); + bedEnd = bedMapEnd2[chrom][j].end(); + } + else { + cerr << "Unexpected end of B requested" << endl; + } + for (; bedItr != bedEnd; ++bedItr) { + float overlap = overlaps(bedItr->bed.start, bedItr->bed.end, start, end); + float size = end - start; + + if ( (overlap / size) >= overlapFraction ) { + + // skip the hit if not on the same strand (and we care) + if ((forceStrand == false) && (enforceDiffNames == false)) { + hits.push_back(*bedItr); // it's a hit, add it. + } + else if ((forceStrand == true) && (enforceDiffNames == false)) { + if (strand == bedItr->bed.strand) + hits.push_back(*bedItr); // it's a hit, add it. + } + else if ((forceStrand == true) && (enforceDiffNames == true)) { + if ((strand == bedItr->bed.strand) && (name != bedItr->bed.name)) + hits.push_back(*bedItr); // it's a hit, add it. + } + else if ((forceStrand == false) && (enforceDiffNames == true)) { + if (name != bedItr->bed.name) + hits.push_back(*bedItr); // it's a hit, add it. + } + } + + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } } void BedFilePE::loadBedPEFileIntoMap() { - int lineNum = 0; - int bin1, bin2; - BedLineStatus bedStatus; - BEDPE bedpeEntry, nullBedPE; + int lineNum = 0; + int bin1, bin2; + BedLineStatus bedStatus; + BEDPE bedpeEntry, nullBedPE; - Open(); - bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); - while (bedStatus != BED_INVALID) { - - if (bedStatus == BED_VALID) { + Open(); + bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); + while (bedStatus != BED_INVALID) { + + if (bedStatus == BED_VALID) { MATE *bedEntry1 = new MATE(); MATE *bedEntry2 = new MATE(); - // separate the BEDPE entry into separate - // BED entries - splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); - - // load end1 into a UCSC bin map - bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end); - this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1); - - // load end2 into a UCSC bin map - bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end); - this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2); - - bedpeEntry = nullBedPE; - } - bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); - } - Close(); + // separate the BEDPE entry into separate + // BED entries + splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); + + // load end1 into a UCSC bin map + bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end); + this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1); + + // load end2 into a UCSC bin map + bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end); + this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2); + + bedpeEntry = nullBedPE; + } + bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); + } + Close(); } void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) { - - /* - Split the BEDPE entry into separate BED entries - - NOTE: I am using a trick here where I store - the lineNum of the BEDPE from the original file - in the "count" column. This allows me to later - resolve whether the hits found on both ends of BEDPE A - came from the same entry in BEDPE B. Tracking by "name" - alone with fail when there are multiple mappings for a given - read-pair. - */ - - bedEntry1->bed.chrom = bedpeEntry.chrom1; - bedEntry1->bed.start = bedpeEntry.start1; - bedEntry1->bed.end = bedpeEntry.end1; - bedEntry1->bed.name = bedpeEntry.name; // only store the name in end1 to save memory - bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory - bedEntry1->bed.strand = bedpeEntry.strand1; - bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory - bedEntry1->lineNum = lineNum; + + /* + Split the BEDPE entry into separate BED entries + + NOTE: I am using a trick here where I store + the lineNum of the BEDPE from the original file + in the "count" column. This allows me to later + resolve whether the hits found on both ends of BEDPE A + came from the same entry in BEDPE B. Tracking by "name" + alone with fail when there are multiple mappings for a given + read-pair. + */ + + bedEntry1->bed.chrom = bedpeEntry.chrom1; + bedEntry1->bed.start = bedpeEntry.start1; + bedEntry1->bed.end = bedpeEntry.end1; + bedEntry1->bed.name = bedpeEntry.name; + bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory + bedEntry1->bed.strand = bedpeEntry.strand1; + bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory + bedEntry1->lineNum = lineNum; bedEntry1->mate = bedEntry2; // keep a pointer to end2 - - bedEntry2->bed.chrom = bedpeEntry.chrom2; - bedEntry2->bed.start = bedpeEntry.start2; - bedEntry2->bed.end = bedpeEntry.end2; - bedEntry2->bed.strand = bedpeEntry.strand2; - bedEntry2->lineNum = lineNum; + + bedEntry2->bed.chrom = bedpeEntry.chrom2; + bedEntry2->bed.start = bedpeEntry.start2; + bedEntry2->bed.end = bedpeEntry.end2; + bedEntry2->bed.name = bedpeEntry.name; + bedEntry2->bed.strand = bedpeEntry.strand2; + bedEntry2->lineNum = lineNum; bedEntry2->mate = bedEntry1; // keep a pointer to end1 } diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index e6e43ed06b7fe719f31545d6b269c79790e436cd..00dedbefcdc6ea6f51a0ddd2f20add356fd4f0c5 100644 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -16,26 +16,26 @@ using namespace std; /* - Structure for paired-end records + Structure for paired-end records */ struct BEDPE { - // UCSC BED fields - string chrom1; - CHRPOS start1; - CHRPOS end1; - - string chrom2; - CHRPOS start2; - CHRPOS end2; - - string name; - string score; - - string strand1; - string strand2; - - vector<string> otherFields; + // UCSC BED fields + string chrom1; + CHRPOS start1; + CHRPOS end1; + + string chrom2; + CHRPOS start2; + CHRPOS end2; + + string name; + string score; + + string strand1; + string strand2; + + vector<string> otherFields; }; @@ -48,46 +48,46 @@ class BedFilePE { public: - // Constructor - BedFilePE(string &); - - // Destructor - ~BedFilePE(void); - - // Open a BEDPE file for reading (creates an istream pointer) - void Open(void); - - // Close an opened BEDPE file. - void Close(void); - - // Get the next BED entry in an opened BED file. - BedLineStatus GetNextBedPE (BEDPE &bedpe, int &lineNum); - - - // Methods - - void reportBedPETab(const BEDPE &a); - void reportBedPENewLine(const BEDPE &a); - void loadBedPEFileIntoMap(); - void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2); - - - void FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, - vector<MATE> &hits, bool forceStrand); - - - string bedFile; - unsigned int bedType; - - masterMateMap bedMapEnd1; - masterMateMap bedMapEnd2; - + // Constructor + BedFilePE(string &); + + // Destructor + ~BedFilePE(void); + + // Open a BEDPE file for reading (creates an istream pointer) + void Open(void); + + // Close an opened BEDPE file. + void Close(void); + + // Get the next BED entry in an opened BED file. + BedLineStatus GetNextBedPE (BEDPE &bedpe, int &lineNum); + + + // Methods + + void reportBedPETab(const BEDPE &a); + void reportBedPENewLine(const BEDPE &a); + void loadBedPEFileIntoMap(); + void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2); + + + void FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand, + vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames); + + + string bedFile; + unsigned int bedType; + + masterMateMap bedMapEnd1; + masterMateMap bedMapEnd2; + private: - istream *_bedStream; - - // methods - BedLineStatus parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum); - bool parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum); + istream *_bedStream; + + // methods + BedLineStatus parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum); + bool parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum); }; #endif /* BEDFILEPE_H */ diff --git a/src/utils/bedGraphFile/bedGraphFile.cpp b/src/utils/bedGraphFile/bedGraphFile.cpp index 3ad8aa4abf99cfa627d005b86ea6cedc06a8cbd2..44b87bf9f635b8e64e0b9ef05f4170b082078ae4 100644 --- a/src/utils/bedGraphFile/bedGraphFile.cpp +++ b/src/utils/bedGraphFile/bedGraphFile.cpp @@ -14,26 +14,26 @@ // Constructor BedGraphFile::BedGraphFile(string &_file) : - bedGraphFile(_file), - _bedGraphStream(NULL) + bedGraphFile(_file), + _bedGraphStream(NULL) {} // Destructor BedGraphFile::~BedGraphFile() { - Close(); + Close(); } // Open the BEDGRAPH file void BedGraphFile::Open() { - if (bedGraphFile == "stdin") { - _bedGraphStream = &cin; - return; - } - // unzipped, regular + if (bedGraphFile == "stdin") { + _bedGraphStream = &cin; + return; + } + // unzipped, regular else if ((isGzipFile(bedGraphFile) == false) && (isRegularFile(bedGraphFile) == true)) { - _bedGraphStream = new ifstream(bedGraphFile.c_str(), ios::in); + _bedGraphStream = new ifstream(bedGraphFile.c_str(), ios::in); // open an ifstream ifstream bedg(bedGraphFile.c_str(), ios::in); @@ -45,24 +45,24 @@ void BedGraphFile::Open() { } else { // if so, close it (this was just a test) - bedg.close(); + bedg.close(); // now set a pointer to the stream so that we _bedGraphStream = new ifstream(bedGraphFile.c_str(), ios::in); } } - else if ((isGzipFile(bedGraphFile) == true) && (isRegularFile(bedGraphFile) == true)) { - + else if ((isGzipFile(bedGraphFile) == true) && (isRegularFile(bedGraphFile) == true)) { + igzstream bedg(bedGraphFile.c_str(), ios::in); - if ( !bedg ) { - cerr << "Error: The requested bedgraph file (" << bedGraphFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - // if so, close it (this was just a test) - bedg.close(); - // now set a pointer to the stream so that we - _bedGraphStream = new igzstream(bedGraphFile.c_str(), ios::in); - } + if ( !bedg ) { + cerr << "Error: The requested bedgraph file (" << bedGraphFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + bedg.close(); + // now set a pointer to the stream so that we + _bedGraphStream = new igzstream(bedGraphFile.c_str(), ios::in); + } } else { cerr << "Error: Unexpected file type (" << bedGraphFile << "). Exiting!" << endl; @@ -73,11 +73,11 @@ void BedGraphFile::Open() { // Close the BEDGRAPH file void BedGraphFile::Close() { - if (bedGraphFile != "stdin") { - if (_bedGraphStream) { - delete _bedGraphStream; - _bedGraphStream = NULL ; - } - } + if (bedGraphFile != "stdin") { + if (_bedGraphStream) { + delete _bedGraphStream; + _bedGraphStream = NULL ; + } + } } diff --git a/src/utils/bedGraphFile/bedGraphFile.h b/src/utils/bedGraphFile/bedGraphFile.h index f37db126f51b75683596902c8c1266e9c88a18a4..41e860de562c015e7dfd1bd27a211dd3b1d09c74 100644 --- a/src/utils/bedGraphFile/bedGraphFile.h +++ b/src/utils/bedGraphFile/bedGraphFile.h @@ -48,29 +48,29 @@ template <typename T> class BEDGRAPH { public: - std::string chrom; - CHRPOS start; - CHRPOS end; - T depth; + std::string chrom; + CHRPOS start; + CHRPOS end; + T depth; public: - typedef T DEPTH_TYPE; - // constructors - - // Null - BEDGRAPH() : - start(0), - end(0), - depth(T()) - {} - - // BEDGraph - BEDGRAPH(string _chrom, CHRPOS _start, CHRPOS _end, T _depth) : - chrom(_chrom), - start(_start), - end(_end), - depth(_depth) - {} + typedef T DEPTH_TYPE; + // constructors + + // Null + BEDGRAPH() : + start(0), + end(0), + depth(T()) + {} + + // BEDGraph + BEDGRAPH(string _chrom, CHRPOS _start, CHRPOS _end, T _depth) : + chrom(_chrom), + start(_start), + end(_end), + depth(_depth) + {} }; // BEDGraph typedef BEDGRAPH<int32_t> BEDGRAPH_INT; @@ -80,11 +80,11 @@ typedef BEDGRAPH<double> BEDGRAPH_FLOAT; template <typename T> std::ostream& operator<< (std::ostream& strm, const BEDGRAPH<T>& bg) { - strm << bg.chrom << "\t" - << bg.start << "\t" - << bg.end << "\t" - << bg.depth; - return strm; + strm << bg.chrom << "\t" + << bg.start << "\t" + << bg.end << "\t" + << bg.depth; + return strm; } // enum to flag the state of a given line in a BEDGraph file. @@ -104,96 +104,96 @@ class BedGraphFile { public: - // Constructor - BedGraphFile(string &); - - // Destructor - ~BedGraphFile(void); - - // Open a BEDGraph file for reading (creates an istream pointer) - void Open(void); - - // Close an opened BED file. - void Close(void); - - // Get the next BED entry in an opened BED file. - template <typename T> - BedGraphLineStatus GetNextBedGraph (BEDGRAPH<T> &bedgraph, int &lineNum) - { - // make sure there are still lines to process. - // if so, tokenize, validate and return the BED entry. - if (_bedGraphStream->good()) { - string bedGraphLine; - vector<string> bedGraphFields; - - // parse the bedStream pointer - getline(*_bedGraphStream, bedGraphLine); - if (_bedGraphStream->eof()) - return BEDGRAPH_INVALID; - if (_bedGraphStream->bad()) { - cerr << "Error while reading file '" << bedGraphFile << "' : " - << strerror(errno) << endl; - exit(1); - } - lineNum++; - - // split into a string vector. - Tokenize(bedGraphLine,bedGraphFields); - - // load the BED struct as long as it's a valid BED entry. - return parseLine(bedgraph, bedGraphFields, lineNum); - } - - // default if file is closed or EOF - return BEDGRAPH_INVALID; - } - - // the bedfile with which this instance is associated - string bedGraphFile; + // Constructor + BedGraphFile(string &); + + // Destructor + ~BedGraphFile(void); + + // Open a BEDGraph file for reading (creates an istream pointer) + void Open(void); + + // Close an opened BED file. + void Close(void); + + // Get the next BED entry in an opened BED file. + template <typename T> + BedGraphLineStatus GetNextBedGraph (BEDGRAPH<T> &bedgraph, int &lineNum) + { + // make sure there are still lines to process. + // if so, tokenize, validate and return the BED entry. + if (_bedGraphStream->good()) { + string bedGraphLine; + vector<string> bedGraphFields; + + // parse the bedStream pointer + getline(*_bedGraphStream, bedGraphLine); + if (_bedGraphStream->eof()) + return BEDGRAPH_INVALID; + if (_bedGraphStream->bad()) { + cerr << "Error while reading file '" << bedGraphFile << "' : " + << strerror(errno) << endl; + exit(1); + } + lineNum++; + + // split into a string vector. + Tokenize(bedGraphLine,bedGraphFields); + + // load the BED struct as long as it's a valid BED entry. + return parseLine(bedgraph, bedGraphFields, lineNum); + } + + // default if file is closed or EOF + return BEDGRAPH_INVALID; + } + + // the bedfile with which this instance is associated + string bedGraphFile; private: - // data - istream *_bedGraphStream; - - template <typename T> - BedGraphLineStatus parseLine (BEDGRAPH<T> &bg, const vector<string> &lineVector, int &lineNum) - { - if (lineVector.size() == 0) - return BEDGRAPH_BLANK; - - if (lineVector[0].find("track") != string::npos || - lineVector[0].find("browser") != string::npos || - lineVector[0].find("#") != string::npos) - return BEDGRAPH_HEADER; - - if (lineVector.size() != 4) - return BEDGRAPH_INVALID; - - bg.chrom = lineVector[0]; - - stringstream str_start(lineVector[1]); - if (! (str_start >> bg.start) ) { - cerr << "Input error, failed to extract start value from '" << lineVector[1] - << "' (column 2) in " << bedGraphFile << " line " << lineNum << endl; - exit(1); - } - - stringstream str_end(lineVector[2]); - if (! (str_end >> bg.end) ) { - cerr << "Input error, failed to extract end value from '" << lineVector[2] - << "' (column 3) in " << bedGraphFile << " line " << lineNum << endl; - exit(1); - } - - stringstream str_depth(lineVector[3]); - if (! (str_depth >> bg.depth) ) { - cerr << "Input error, failed to extract depth value from '" << lineVector[3] - << "' (column 4) in " << bedGraphFile << " line " << lineNum << endl; - exit(1); - } - - return BEDGRAPH_VALID; - } + // data + istream *_bedGraphStream; + + template <typename T> + BedGraphLineStatus parseLine (BEDGRAPH<T> &bg, const vector<string> &lineVector, int &lineNum) + { + if (lineVector.size() == 0) + return BEDGRAPH_BLANK; + + if (lineVector[0].find("track") != string::npos || + lineVector[0].find("browser") != string::npos || + lineVector[0].find("#") != string::npos) + return BEDGRAPH_HEADER; + + if (lineVector.size() != 4) + return BEDGRAPH_INVALID; + + bg.chrom = lineVector[0]; + + stringstream str_start(lineVector[1]); + if (! (str_start >> bg.start) ) { + cerr << "Input error, failed to extract start value from '" << lineVector[1] + << "' (column 2) in " << bedGraphFile << " line " << lineNum << endl; + exit(1); + } + + stringstream str_end(lineVector[2]); + if (! (str_end >> bg.end) ) { + cerr << "Input error, failed to extract end value from '" << lineVector[2] + << "' (column 3) in " << bedGraphFile << " line " << lineNum << endl; + exit(1); + } + + stringstream str_depth(lineVector[3]); + if (! (str_depth >> bg.depth) ) { + cerr << "Input error, failed to extract depth value from '" << lineVector[3] + << "' (column 4) in " << bedGraphFile << " line " << lineNum << endl; + exit(1); + } + + return BEDGRAPH_VALID; + } }; #endif /* BEDFILE_H */ diff --git a/src/utils/curl/Makefile.am b/src/utils/curl/Makefile.am deleted file mode 100644 index a3b4443865af9ea14812ee0dfe3dc85d34c34f77..0000000000000000000000000000000000000000 --- a/src/utils/curl/Makefile.am +++ /dev/null @@ -1,25 +0,0 @@ -pkginclude_HEADERS = \ - curl.h curlver.h easy.h mprintf.h stdcheaders.h types.h multi.h \ - typecheck-gcc.h curlbuild.h curlrules.h - -pkgincludedir= $(includedir)/curl - -# curlbuild.h does not exist in the git tree. When the original libcurl -# source code distribution archive file is created, curlbuild.h.dist is -# renamed to curlbuild.h and included in the tarball so that it can be -# used directly on non-configure systems. -# -# The distributed curlbuild.h will be overwritten on configure systems -# when the configure script runs, with one that is suitable and specific -# to the library being configured and built. -# -# curlbuild.h.in is the distributed template file from which the configure -# script creates curlbuild.h at library configuration time, overwiting the -# one included in the distribution archive. -# -# curlbuild.h.dist is not included in the source code distribution archive. - -EXTRA_DIST = curlbuild.h.in - -DISTCLEANFILES = curlbuild.h - diff --git a/src/utils/curl/Makefile.in b/src/utils/curl/Makefile.in deleted file mode 100644 index 58eb07b969168c8cc123e2b696837fce24808eea..0000000000000000000000000000000000000000 --- a/src/utils/curl/Makefile.in +++ /dev/null @@ -1,511 +0,0 @@ -# Makefile.in generated by automake 1.9.6 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005 Free Software Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -srcdir = @srcdir@ -top_srcdir = @top_srcdir@ -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -top_builddir = ../.. -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -INSTALL = @INSTALL@ -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -subdir = include/curl -DIST_COMMON = $(pkginclude_HEADERS) $(srcdir)/Makefile.am \ - $(srcdir)/Makefile.in $(srcdir)/curlbuild.h.in -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/curl-compilers.m4 \ - $(top_srcdir)/m4/curl-confopts.m4 \ - $(top_srcdir)/m4/curl-functions.m4 \ - $(top_srcdir)/m4/curl-override.m4 \ - $(top_srcdir)/m4/curl-reentrant.m4 \ - $(top_srcdir)/m4/curl-system.m4 $(top_srcdir)/m4/libtool.m4 \ - $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ - $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ - $(top_srcdir)/acinclude.m4 $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs -CONFIG_HEADER = $(top_builddir)/lib/curl_config.h \ - $(top_builddir)/src/curl_config.h curlbuild.h -CONFIG_CLEAN_FILES = -SOURCES = -DIST_SOURCES = -am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; -am__vpath_adj = case $$p in \ - $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ - *) f=$$p;; \ - esac; -am__strip_dir = `echo $$p | sed -e 's|^.*/||'`; -am__installdirs = "$(DESTDIR)$(pkgincludedir)" -pkgincludeHEADERS_INSTALL = $(INSTALL_HEADER) -HEADERS = $(pkginclude_HEADERS) -ETAGS = etags -CTAGS = ctags -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -pkgincludedir = $(includedir)/curl -ACLOCAL = @ACLOCAL@ -AMDEP_FALSE = @AMDEP_FALSE@ -AMDEP_TRUE = @AMDEP_TRUE@ -AMTAR = @AMTAR@ -AR = @AR@ -AS = @AS@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CONFIGURE_OPTIONS = @CONFIGURE_OPTIONS@ -CPP = @CPP@ -CPPFLAGS = @CPPFLAGS@ -CROSSCOMPILING_FALSE = @CROSSCOMPILING_FALSE@ -CROSSCOMPILING_TRUE = @CROSSCOMPILING_TRUE@ -CURLDEBUG_FALSE = @CURLDEBUG_FALSE@ -CURLDEBUG_TRUE = @CURLDEBUG_TRUE@ -CURL_CA_BUNDLE = @CURL_CA_BUNDLE@ -CURL_DISABLE_DICT = @CURL_DISABLE_DICT@ -CURL_DISABLE_FILE = @CURL_DISABLE_FILE@ -CURL_DISABLE_FTP = @CURL_DISABLE_FTP@ -CURL_DISABLE_HTTP = @CURL_DISABLE_HTTP@ -CURL_DISABLE_IMAP = @CURL_DISABLE_IMAP@ -CURL_DISABLE_LDAP = @CURL_DISABLE_LDAP@ -CURL_DISABLE_LDAPS = @CURL_DISABLE_LDAPS@ -CURL_DISABLE_POP3 = @CURL_DISABLE_POP3@ -CURL_DISABLE_PROXY = @CURL_DISABLE_PROXY@ -CURL_DISABLE_RTSP = @CURL_DISABLE_RTSP@ -CURL_DISABLE_SMTP = @CURL_DISABLE_SMTP@ -CURL_DISABLE_TELNET = @CURL_DISABLE_TELNET@ -CURL_DISABLE_TFTP = @CURL_DISABLE_TFTP@ -CURL_LIBS = @CURL_LIBS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -DLLTOOL = @DLLTOOL@ -DSYMUTIL = @DSYMUTIL@ -DUMPBIN = @DUMPBIN@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -FGREP = @FGREP@ -GREP = @GREP@ -HAVE_LIBZ = @HAVE_LIBZ@ -HAVE_LIBZ_FALSE = @HAVE_LIBZ_FALSE@ -HAVE_LIBZ_TRUE = @HAVE_LIBZ_TRUE@ -HAVE_PK11_CREATEGENERICOBJECT = @HAVE_PK11_CREATEGENERICOBJECT@ -IDN_ENABLED = @IDN_ENABLED@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -IPV6_ENABLED = @IPV6_ENABLED@ -KRB4_ENABLED = @KRB4_ENABLED@ -LD = @LD@ -LDFLAGS = @LDFLAGS@ -LIBCURL_LIBS = @LIBCURL_LIBS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LIBTOOL = @LIBTOOL@ -LIPO = @LIPO@ -LN_S = @LN_S@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ -MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ -MAKEINFO = @MAKEINFO@ -MANOPT = @MANOPT@ -MIMPURE_FALSE = @MIMPURE_FALSE@ -MIMPURE_TRUE = @MIMPURE_TRUE@ -NM = @NM@ -NMEDIT = @NMEDIT@ -NO_UNDEFINED_FALSE = @NO_UNDEFINED_FALSE@ -NO_UNDEFINED_TRUE = @NO_UNDEFINED_TRUE@ -NROFF = @NROFF@ -OBJDUMP = @OBJDUMP@ -OBJEXT = @OBJEXT@ -OTOOL = @OTOOL@ -OTOOL64 = @OTOOL64@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_URL = @PACKAGE_URL@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH = @PATH@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PERL = @PERL@ -PKGADD_NAME = @PKGADD_NAME@ -PKGADD_PKG = @PKGADD_PKG@ -PKGADD_VENDOR = @PKGADD_VENDOR@ -PKGCONFIG = @PKGCONFIG@ -RANDOM_FILE = @RANDOM_FILE@ -RANLIB = @RANLIB@ -REQUIRE_LIB_DEPS = @REQUIRE_LIB_DEPS@ -SED = @SED@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -SONAME_BUMP_FALSE = @SONAME_BUMP_FALSE@ -SONAME_BUMP_TRUE = @SONAME_BUMP_TRUE@ -SSL_ENABLED = @SSL_ENABLED@ -STATICLIB_FALSE = @STATICLIB_FALSE@ -STATICLIB_TRUE = @STATICLIB_TRUE@ -STRIP = @STRIP@ -SUPPORT_FEATURES = @SUPPORT_FEATURES@ -SUPPORT_PROTOCOLS = @SUPPORT_PROTOCOLS@ -TEST_SERVER_LIBS = @TEST_SERVER_LIBS@ -USE_ARES = @USE_ARES@ -USE_EMBEDDED_ARES_FALSE = @USE_EMBEDDED_ARES_FALSE@ -USE_EMBEDDED_ARES_TRUE = @USE_EMBEDDED_ARES_TRUE@ -USE_GNUTLS = @USE_GNUTLS@ -USE_LIBSSH2 = @USE_LIBSSH2@ -USE_MANUAL_FALSE = @USE_MANUAL_FALSE@ -USE_MANUAL_TRUE = @USE_MANUAL_TRUE@ -USE_NSS = @USE_NSS@ -USE_SSLEAY = @USE_SSLEAY@ -USE_WINDOWS_SSPI = @USE_WINDOWS_SSPI@ -VERSION = @VERSION@ -VERSIONNUM = @VERSIONNUM@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ -am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ -am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -libext = @libext@ -localedir = @localedir@ -localstatedir = @localstatedir@ -lt_ECHO = @lt_ECHO@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -subdirs = @subdirs@ -sysconfdir = @sysconfdir@ -target_alias = @target_alias@ -pkginclude_HEADERS = \ - curl.h curlver.h easy.h mprintf.h stdcheaders.h types.h multi.h \ - typecheck-gcc.h curlbuild.h curlrules.h - - -# curlbuild.h does not exist in the git tree. When the original libcurl -# source code distribution archive file is created, curlbuild.h.dist is -# renamed to curlbuild.h and included in the tarball so that it can be -# used directly on non-configure systems. -# -# The distributed curlbuild.h will be overwritten on configure systems -# when the configure script runs, with one that is suitable and specific -# to the library being configured and built. -# -# curlbuild.h.in is the distributed template file from which the configure -# script creates curlbuild.h at library configuration time, overwiting the -# one included in the distribution archive. -# -# curlbuild.h.dist is not included in the source code distribution archive. -EXTRA_DIST = curlbuild.h.in -DISTCLEANFILES = curlbuild.h -all: curlbuild.h - $(MAKE) $(AM_MAKEFLAGS) all-am - -.SUFFIXES: -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ - && exit 0; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign include/curl/Makefile'; \ - cd $(top_srcdir) && \ - $(AUTOMAKE) --foreign include/curl/Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -curlbuild.h: stamp-h3 - @if test ! -f $@; then \ - rm -f stamp-h3; \ - $(MAKE) stamp-h3; \ - else :; fi - -stamp-h3: $(srcdir)/curlbuild.h.in $(top_builddir)/config.status - @rm -f stamp-h3 - cd $(top_builddir) && $(SHELL) ./config.status include/curl/curlbuild.h -$(srcdir)/curlbuild.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_srcdir) && $(AUTOHEADER) - rm -f stamp-h3 - touch $@ - -distclean-hdr: - -rm -f curlbuild.h stamp-h3 - -mostlyclean-libtool: - -rm -f *.lo - -clean-libtool: - -rm -rf .libs _libs - -distclean-libtool: - -rm -f libtool -uninstall-info-am: -install-pkgincludeHEADERS: $(pkginclude_HEADERS) - @$(NORMAL_INSTALL) - test -z "$(pkgincludedir)" || $(mkdir_p) "$(DESTDIR)$(pkgincludedir)" - @list='$(pkginclude_HEADERS)'; for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - f=$(am__strip_dir) \ - echo " $(pkgincludeHEADERS_INSTALL) '$$d$$p' '$(DESTDIR)$(pkgincludedir)/$$f'"; \ - $(pkgincludeHEADERS_INSTALL) "$$d$$p" "$(DESTDIR)$(pkgincludedir)/$$f"; \ - done - -uninstall-pkgincludeHEADERS: - @$(NORMAL_UNINSTALL) - @list='$(pkginclude_HEADERS)'; for p in $$list; do \ - f=$(am__strip_dir) \ - echo " rm -f '$(DESTDIR)$(pkgincludedir)/$$f'"; \ - rm -f "$(DESTDIR)$(pkgincludedir)/$$f"; \ - done - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: $(HEADERS) $(SOURCES) curlbuild.h.in $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) curlbuild.h.in $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$tags $$unique; \ - fi -ctags: CTAGS -CTAGS: $(HEADERS) $(SOURCES) curlbuild.h.in $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) curlbuild.h.in $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - test -z "$(CTAGS_ARGS)$$tags$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$tags $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && cd $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) $$here - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ - list='$(DISTFILES)'; for file in $$list; do \ - case $$file in \ - $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ - $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ - esac; \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test "$$dir" != "$$file" && test "$$dir" != "."; then \ - dir="/$$dir"; \ - $(mkdir_p) "$(distdir)$$dir"; \ - else \ - dir=''; \ - fi; \ - if test -d $$d/$$file; then \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ - fi; \ - cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ - else \ - test -f $(distdir)/$$file \ - || cp -p $$d/$$file $(distdir)/$$file \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-am -all-am: Makefile $(HEADERS) curlbuild.h -installdirs: - for dir in "$(DESTDIR)$(pkgincludedir)"; do \ - test -z "$$dir" || $(mkdir_p) "$$dir"; \ - done -install: install-am -install-exec: install-exec-am -install-data: install-data-am -uninstall: uninstall-am - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-am -install-strip: - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - `test -z '$(STRIP)' || \ - echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." -clean: clean-am - -clean-am: clean-generic clean-libtool mostlyclean-am - -distclean: distclean-am - -rm -f Makefile -distclean-am: clean-am distclean-generic distclean-hdr \ - distclean-libtool distclean-tags - -dvi: dvi-am - -dvi-am: - -html: html-am - -info: info-am - -info-am: - -install-data-am: install-pkgincludeHEADERS - -install-exec-am: - -install-info: install-info-am - -install-man: - -installcheck-am: - -maintainer-clean: maintainer-clean-am - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-am - -mostlyclean-am: mostlyclean-generic mostlyclean-libtool - -pdf: pdf-am - -pdf-am: - -ps: ps-am - -ps-am: - -uninstall-am: uninstall-info-am uninstall-pkgincludeHEADERS - -.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ - clean-libtool ctags distclean distclean-generic distclean-hdr \ - distclean-libtool distclean-tags distdir dvi dvi-am html \ - html-am info info-am install install-am install-data \ - install-data-am install-exec install-exec-am install-info \ - install-info-am install-man install-pkgincludeHEADERS \ - install-strip installcheck installcheck-am installdirs \ - maintainer-clean maintainer-clean-generic mostlyclean \ - mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ - tags uninstall uninstall-am uninstall-info-am \ - uninstall-pkgincludeHEADERS - -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/src/utils/curl/curl.h b/src/utils/curl/curl.h deleted file mode 100644 index e63596828a4b1cae4c9d3bee1a02aad1a18af347..0000000000000000000000000000000000000000 --- a/src/utils/curl/curl.h +++ /dev/null @@ -1,1996 +0,0 @@ -#ifndef __CURL_CURL_H -#define __CURL_CURL_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* - * If you have libcurl problems, all docs and details are found here: - * http://curl.haxx.se/libcurl/ - * - * curl-library mailing list subscription and unsubscription web interface: - * http://cool.haxx.se/mailman/listinfo/curl-library/ - */ - -#include "curlver.h" /* libcurl version defines */ -#include "curlbuild.h" /* libcurl build definitions */ -#include "curlrules.h" /* libcurl rules enforcement */ - -/* - * Define WIN32 when build target is Win32 API - */ - -#if (defined(_WIN32) || defined(__WIN32__)) && \ - !defined(WIN32) && !defined(__SYMBIAN32__) -#define WIN32 -#endif - -#include <stdio.h> -#include <limits.h> - -#if defined(__FreeBSD__) && (__FreeBSD__ >= 2) -/* Needed for __FreeBSD_version symbol definition */ -#include <osreldate.h> -#endif - -/* The include stuff here below is mainly for time_t! */ -#include <sys/types.h> -#include <time.h> - -#if defined(WIN32) && !defined(_WIN32_WCE) && !defined(__GNUC__) && \ - !defined(__CYGWIN__) || defined(__MINGW32__) -#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H)) -/* The check above prevents the winsock2 inclusion if winsock.h already was - included, since they can't co-exist without problems */ -#include <winsock2.h> -#include <ws2tcpip.h> -#endif -#else - -/* HP-UX systems version 9, 10 and 11 lack sys/select.h and so does oldish - libc5-based Linux systems. Only include it on system that are known to - require it! */ -#if defined(_AIX) || defined(__NOVELL_LIBC__) || defined(__NetBSD__) || \ - defined(__minix) || defined(__SYMBIAN32__) || defined(__INTEGRITY) || \ - defined(ANDROID) || \ - (defined(__FreeBSD_version) && (__FreeBSD_version < 800000)) -#include <sys/select.h> -#endif - -#ifndef _WIN32_WCE -#include <sys/socket.h> -#endif -#if !defined(WIN32) && !defined(__WATCOMC__) && !defined(__VXWORKS__) -#include <sys/time.h> -#endif -#include <sys/types.h> -#endif - -#ifdef __BEOS__ -#include <support/SupportDefs.h> -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void CURL; - -/* - * Decorate exportable functions for Win32 and Symbian OS DLL linking. - * This avoids using a .def file for building libcurl.dll. - */ -#if (defined(WIN32) || defined(_WIN32) || defined(__SYMBIAN32__)) && \ - !defined(CURL_STATICLIB) -#if defined(BUILDING_LIBCURL) -#define CURL_EXTERN __declspec(dllexport) -#else -#define CURL_EXTERN __declspec(dllimport) -#endif -#else - -#ifdef CURL_HIDDEN_SYMBOLS -/* - * This definition is used to make external definitions visible in the - * shared library when symbols are hidden by default. It makes no - * difference when compiling applications whether this is set or not, - * only when compiling the library. - */ -#define CURL_EXTERN CURL_EXTERN_SYMBOL -#else -#define CURL_EXTERN -#endif -#endif - -#ifndef curl_socket_typedef -/* socket typedef */ -#ifdef WIN32 -typedef SOCKET curl_socket_t; -#define CURL_SOCKET_BAD INVALID_SOCKET -#else -typedef int curl_socket_t; -#define CURL_SOCKET_BAD -1 -#endif -#define curl_socket_typedef -#endif /* curl_socket_typedef */ - -struct curl_httppost { - struct curl_httppost *next; /* next entry in the list */ - char *name; /* pointer to allocated name */ - long namelength; /* length of name length */ - char *contents; /* pointer to allocated data contents */ - long contentslength; /* length of contents field */ - char *buffer; /* pointer to allocated buffer contents */ - long bufferlength; /* length of buffer field */ - char *contenttype; /* Content-Type */ - struct curl_slist* contentheader; /* list of extra headers for this form */ - struct curl_httppost *more; /* if one field name has more than one - file, this link should link to following - files */ - long flags; /* as defined below */ -#define HTTPPOST_FILENAME (1<<0) /* specified content is a file name */ -#define HTTPPOST_READFILE (1<<1) /* specified content is a file name */ -#define HTTPPOST_PTRNAME (1<<2) /* name is only stored pointer - do not free in formfree */ -#define HTTPPOST_PTRCONTENTS (1<<3) /* contents is only stored pointer - do not free in formfree */ -#define HTTPPOST_BUFFER (1<<4) /* upload file from buffer */ -#define HTTPPOST_PTRBUFFER (1<<5) /* upload file from pointer contents */ -#define HTTPPOST_CALLBACK (1<<6) /* upload file contents by using the - regular read callback to get the data - and pass the given pointer as custom - pointer */ - - char *showfilename; /* The file name to show. If not set, the - actual file name will be used (if this - is a file part) */ - void *userp; /* custom pointer used for - HTTPPOST_CALLBACK posts */ -}; - -typedef int (*curl_progress_callback)(void *clientp, - double dltotal, - double dlnow, - double ultotal, - double ulnow); - -#ifndef CURL_MAX_WRITE_SIZE - /* Tests have proven that 20K is a very bad buffer size for uploads on - Windows, while 16K for some odd reason performed a lot better. - We do the ifndef check to allow this value to easier be changed at build - time for those who feel adventurous. The practical minimum is about - 400 bytes since libcurl uses a buffer of this size as a scratch area - (unrelated to network send operations). */ -#define CURL_MAX_WRITE_SIZE 16384 -#endif - -#ifndef CURL_MAX_HTTP_HEADER -/* The only reason to have a max limit for this is to avoid the risk of a bad - server feeding libcurl with a never-ending header that will cause reallocs - infinitely */ -#define CURL_MAX_HTTP_HEADER (100*1024) -#endif - - -/* This is a magic return code for the write callback that, when returned, - will signal libcurl to pause receiving on the current transfer. */ -#define CURL_WRITEFUNC_PAUSE 0x10000001 -typedef size_t (*curl_write_callback)(char *buffer, - size_t size, - size_t nitems, - void *outstream); - -/* These are the return codes for the seek callbacks */ -#define CURL_SEEKFUNC_OK 0 -#define CURL_SEEKFUNC_FAIL 1 /* fail the entire transfer */ -#define CURL_SEEKFUNC_CANTSEEK 2 /* tell libcurl seeking can't be done, so - libcurl might try other means instead */ -typedef int (*curl_seek_callback)(void *instream, - curl_off_t offset, - int origin); /* 'whence' */ - -/* This is a return code for the read callback that, when returned, will - signal libcurl to immediately abort the current transfer. */ -#define CURL_READFUNC_ABORT 0x10000000 -/* This is a return code for the read callback that, when returned, will - signal libcurl to pause sending data on the current transfer. */ -#define CURL_READFUNC_PAUSE 0x10000001 - -typedef size_t (*curl_read_callback)(char *buffer, - size_t size, - size_t nitems, - void *instream); - -typedef enum { - CURLSOCKTYPE_IPCXN, /* socket created for a specific IP connection */ - CURLSOCKTYPE_LAST /* never use */ -} curlsocktype; - -typedef int (*curl_sockopt_callback)(void *clientp, - curl_socket_t curlfd, - curlsocktype purpose); - -struct curl_sockaddr { - int family; - int socktype; - int protocol; - unsigned int addrlen; /* addrlen was a socklen_t type before 7.18.0 but it - turned really ugly and painful on the systems that - lack this type */ - struct sockaddr addr; -}; - -typedef curl_socket_t -(*curl_opensocket_callback)(void *clientp, - curlsocktype purpose, - struct curl_sockaddr *address); - -typedef enum { - CURLIOE_OK, /* I/O operation successful */ - CURLIOE_UNKNOWNCMD, /* command was unknown to callback */ - CURLIOE_FAILRESTART, /* failed to restart the read */ - CURLIOE_LAST /* never use */ -} curlioerr; - -typedef enum { - CURLIOCMD_NOP, /* no operation */ - CURLIOCMD_RESTARTREAD, /* restart the read stream from start */ - CURLIOCMD_LAST /* never use */ -} curliocmd; - -typedef curlioerr (*curl_ioctl_callback)(CURL *handle, - int cmd, - void *clientp); - -/* - * The following typedef's are signatures of malloc, free, realloc, strdup and - * calloc respectively. Function pointers of these types can be passed to the - * curl_global_init_mem() function to set user defined memory management - * callback routines. - */ -typedef void *(*curl_malloc_callback)(size_t size); -typedef void (*curl_free_callback)(void *ptr); -typedef void *(*curl_realloc_callback)(void *ptr, size_t size); -typedef char *(*curl_strdup_callback)(const char *str); -typedef void *(*curl_calloc_callback)(size_t nmemb, size_t size); - -/* the kind of data that is passed to information_callback*/ -typedef enum { - CURLINFO_TEXT = 0, - CURLINFO_HEADER_IN, /* 1 */ - CURLINFO_HEADER_OUT, /* 2 */ - CURLINFO_DATA_IN, /* 3 */ - CURLINFO_DATA_OUT, /* 4 */ - CURLINFO_SSL_DATA_IN, /* 5 */ - CURLINFO_SSL_DATA_OUT, /* 6 */ - CURLINFO_END -} curl_infotype; - -typedef int (*curl_debug_callback) - (CURL *handle, /* the handle/transfer this concerns */ - curl_infotype type, /* what kind of data */ - char *data, /* points to the data */ - size_t size, /* size of the data pointed to */ - void *userptr); /* whatever the user please */ - -/* All possible error codes from all sorts of curl functions. Future versions - may return other values, stay prepared. - - Always add new return codes last. Never *EVER* remove any. The return - codes must remain the same! - */ - -typedef enum { - CURLE_OK = 0, - CURLE_UNSUPPORTED_PROTOCOL, /* 1 */ - CURLE_FAILED_INIT, /* 2 */ - CURLE_URL_MALFORMAT, /* 3 */ - CURLE_OBSOLETE4, /* 4 - NOT USED */ - CURLE_COULDNT_RESOLVE_PROXY, /* 5 */ - CURLE_COULDNT_RESOLVE_HOST, /* 6 */ - CURLE_COULDNT_CONNECT, /* 7 */ - CURLE_FTP_WEIRD_SERVER_REPLY, /* 8 */ - CURLE_REMOTE_ACCESS_DENIED, /* 9 a service was denied by the server - due to lack of access - when login fails - this is not returned. */ - CURLE_OBSOLETE10, /* 10 - NOT USED */ - CURLE_FTP_WEIRD_PASS_REPLY, /* 11 */ - CURLE_OBSOLETE12, /* 12 - NOT USED */ - CURLE_FTP_WEIRD_PASV_REPLY, /* 13 */ - CURLE_FTP_WEIRD_227_FORMAT, /* 14 */ - CURLE_FTP_CANT_GET_HOST, /* 15 */ - CURLE_OBSOLETE16, /* 16 - NOT USED */ - CURLE_FTP_COULDNT_SET_TYPE, /* 17 */ - CURLE_PARTIAL_FILE, /* 18 */ - CURLE_FTP_COULDNT_RETR_FILE, /* 19 */ - CURLE_OBSOLETE20, /* 20 - NOT USED */ - CURLE_QUOTE_ERROR, /* 21 - quote command failure */ - CURLE_HTTP_RETURNED_ERROR, /* 22 */ - CURLE_WRITE_ERROR, /* 23 */ - CURLE_OBSOLETE24, /* 24 - NOT USED */ - CURLE_UPLOAD_FAILED, /* 25 - failed upload "command" */ - CURLE_READ_ERROR, /* 26 - couldn't open/read from file */ - CURLE_OUT_OF_MEMORY, /* 27 */ - /* Note: CURLE_OUT_OF_MEMORY may sometimes indicate a conversion error - instead of a memory allocation error if CURL_DOES_CONVERSIONS - is defined - */ - CURLE_OPERATION_TIMEDOUT, /* 28 - the timeout time was reached */ - CURLE_OBSOLETE29, /* 29 - NOT USED */ - CURLE_FTP_PORT_FAILED, /* 30 - FTP PORT operation failed */ - CURLE_FTP_COULDNT_USE_REST, /* 31 - the REST command failed */ - CURLE_OBSOLETE32, /* 32 - NOT USED */ - CURLE_RANGE_ERROR, /* 33 - RANGE "command" didn't work */ - CURLE_HTTP_POST_ERROR, /* 34 */ - CURLE_SSL_CONNECT_ERROR, /* 35 - wrong when connecting with SSL */ - CURLE_BAD_DOWNLOAD_RESUME, /* 36 - couldn't resume download */ - CURLE_FILE_COULDNT_READ_FILE, /* 37 */ - CURLE_LDAP_CANNOT_BIND, /* 38 */ - CURLE_LDAP_SEARCH_FAILED, /* 39 */ - CURLE_OBSOLETE40, /* 40 - NOT USED */ - CURLE_FUNCTION_NOT_FOUND, /* 41 */ - CURLE_ABORTED_BY_CALLBACK, /* 42 */ - CURLE_BAD_FUNCTION_ARGUMENT, /* 43 */ - CURLE_OBSOLETE44, /* 44 - NOT USED */ - CURLE_INTERFACE_FAILED, /* 45 - CURLOPT_INTERFACE failed */ - CURLE_OBSOLETE46, /* 46 - NOT USED */ - CURLE_TOO_MANY_REDIRECTS , /* 47 - catch endless re-direct loops */ - CURLE_UNKNOWN_TELNET_OPTION, /* 48 - User specified an unknown option */ - CURLE_TELNET_OPTION_SYNTAX , /* 49 - Malformed telnet option */ - CURLE_OBSOLETE50, /* 50 - NOT USED */ - CURLE_PEER_FAILED_VERIFICATION, /* 51 - peer's certificate or fingerprint - wasn't verified fine */ - CURLE_GOT_NOTHING, /* 52 - when this is a specific error */ - CURLE_SSL_ENGINE_NOTFOUND, /* 53 - SSL crypto engine not found */ - CURLE_SSL_ENGINE_SETFAILED, /* 54 - can not set SSL crypto engine as - default */ - CURLE_SEND_ERROR, /* 55 - failed sending network data */ - CURLE_RECV_ERROR, /* 56 - failure in receiving network data */ - CURLE_OBSOLETE57, /* 57 - NOT IN USE */ - CURLE_SSL_CERTPROBLEM, /* 58 - problem with the local certificate */ - CURLE_SSL_CIPHER, /* 59 - couldn't use specified cipher */ - CURLE_SSL_CACERT, /* 60 - problem with the CA cert (path?) */ - CURLE_BAD_CONTENT_ENCODING, /* 61 - Unrecognized transfer encoding */ - CURLE_LDAP_INVALID_URL, /* 62 - Invalid LDAP URL */ - CURLE_FILESIZE_EXCEEDED, /* 63 - Maximum file size exceeded */ - CURLE_USE_SSL_FAILED, /* 64 - Requested FTP SSL level failed */ - CURLE_SEND_FAIL_REWIND, /* 65 - Sending the data requires a rewind - that failed */ - CURLE_SSL_ENGINE_INITFAILED, /* 66 - failed to initialise ENGINE */ - CURLE_LOGIN_DENIED, /* 67 - user, password or similar was not - accepted and we failed to login */ - CURLE_TFTP_NOTFOUND, /* 68 - file not found on server */ - CURLE_TFTP_PERM, /* 69 - permission problem on server */ - CURLE_REMOTE_DISK_FULL, /* 70 - out of disk space on server */ - CURLE_TFTP_ILLEGAL, /* 71 - Illegal TFTP operation */ - CURLE_TFTP_UNKNOWNID, /* 72 - Unknown transfer ID */ - CURLE_REMOTE_FILE_EXISTS, /* 73 - File already exists */ - CURLE_TFTP_NOSUCHUSER, /* 74 - No such user */ - CURLE_CONV_FAILED, /* 75 - conversion failed */ - CURLE_CONV_REQD, /* 76 - caller must register conversion - callbacks using curl_easy_setopt options - CURLOPT_CONV_FROM_NETWORK_FUNCTION, - CURLOPT_CONV_TO_NETWORK_FUNCTION, and - CURLOPT_CONV_FROM_UTF8_FUNCTION */ - CURLE_SSL_CACERT_BADFILE, /* 77 - could not load CACERT file, missing - or wrong format */ - CURLE_REMOTE_FILE_NOT_FOUND, /* 78 - remote file not found */ - CURLE_SSH, /* 79 - error from the SSH layer, somewhat - generic so the error message will be of - interest when this has happened */ - - CURLE_SSL_SHUTDOWN_FAILED, /* 80 - Failed to shut down the SSL - connection */ - CURLE_AGAIN, /* 81 - socket is not ready for send/recv, - wait till it's ready and try again (Added - in 7.18.2) */ - CURLE_SSL_CRL_BADFILE, /* 82 - could not load CRL file, missing or - wrong format (Added in 7.19.0) */ - CURLE_SSL_ISSUER_ERROR, /* 83 - Issuer check failed. (Added in - 7.19.0) */ - CURLE_FTP_PRET_FAILED, /* 84 - a PRET command failed */ - CURLE_RTSP_CSEQ_ERROR, /* 85 - mismatch of RTSP CSeq numbers */ - CURLE_RTSP_SESSION_ERROR, /* 86 - mismatch of RTSP Session Identifiers */ - - CURL_LAST /* never use! */ -} CURLcode; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! */ - -/* Backwards compatibility with older names */ - -/* The following were added in 7.17.1 */ -/* These are scheduled to disappear by 2009 */ -#define CURLE_SSL_PEER_CERTIFICATE CURLE_PEER_FAILED_VERIFICATION - -/* The following were added in 7.17.0 */ -/* These are scheduled to disappear by 2009 */ -#define CURLE_OBSOLETE CURLE_OBSOLETE50 /* noone should be using this! */ -#define CURLE_BAD_PASSWORD_ENTERED CURLE_OBSOLETE46 -#define CURLE_BAD_CALLING_ORDER CURLE_OBSOLETE44 -#define CURLE_FTP_USER_PASSWORD_INCORRECT CURLE_OBSOLETE10 -#define CURLE_FTP_CANT_RECONNECT CURLE_OBSOLETE16 -#define CURLE_FTP_COULDNT_GET_SIZE CURLE_OBSOLETE32 -#define CURLE_FTP_COULDNT_SET_ASCII CURLE_OBSOLETE29 -#define CURLE_FTP_WEIRD_USER_REPLY CURLE_OBSOLETE12 -#define CURLE_FTP_WRITE_ERROR CURLE_OBSOLETE20 -#define CURLE_LIBRARY_NOT_FOUND CURLE_OBSOLETE40 -#define CURLE_MALFORMAT_USER CURLE_OBSOLETE24 -#define CURLE_SHARE_IN_USE CURLE_OBSOLETE57 -#define CURLE_URL_MALFORMAT_USER CURLE_OBSOLETE4 - -#define CURLE_FTP_ACCESS_DENIED CURLE_REMOTE_ACCESS_DENIED -#define CURLE_FTP_COULDNT_SET_BINARY CURLE_FTP_COULDNT_SET_TYPE -#define CURLE_FTP_QUOTE_ERROR CURLE_QUOTE_ERROR -#define CURLE_TFTP_DISKFULL CURLE_REMOTE_DISK_FULL -#define CURLE_TFTP_EXISTS CURLE_REMOTE_FILE_EXISTS -#define CURLE_HTTP_RANGE_ERROR CURLE_RANGE_ERROR -#define CURLE_FTP_SSL_FAILED CURLE_USE_SSL_FAILED - -/* The following were added earlier */ - -#define CURLE_OPERATION_TIMEOUTED CURLE_OPERATION_TIMEDOUT - -#define CURLE_HTTP_NOT_FOUND CURLE_HTTP_RETURNED_ERROR -#define CURLE_HTTP_PORT_FAILED CURLE_INTERFACE_FAILED -#define CURLE_FTP_COULDNT_STOR_FILE CURLE_UPLOAD_FAILED - -#define CURLE_FTP_PARTIAL_FILE CURLE_PARTIAL_FILE -#define CURLE_FTP_BAD_DOWNLOAD_RESUME CURLE_BAD_DOWNLOAD_RESUME - -/* This was the error code 50 in 7.7.3 and a few earlier versions, this - is no longer used by libcurl but is instead #defined here only to not - make programs break */ -#define CURLE_ALREADY_COMPLETE 99999 - -#endif /*!CURL_NO_OLDIES*/ - -/* This prototype applies to all conversion callbacks */ -typedef CURLcode (*curl_conv_callback)(char *buffer, size_t length); - -typedef CURLcode (*curl_ssl_ctx_callback)(CURL *curl, /* easy handle */ - void *ssl_ctx, /* actually an - OpenSSL SSL_CTX */ - void *userptr); - -typedef enum { - CURLPROXY_HTTP = 0, /* added in 7.10, new in 7.19.4 default is to use - CONNECT HTTP/1.1 */ - CURLPROXY_HTTP_1_0 = 1, /* added in 7.19.4, force to use CONNECT - HTTP/1.0 */ - CURLPROXY_SOCKS4 = 4, /* support added in 7.15.2, enum existed already - in 7.10 */ - CURLPROXY_SOCKS5 = 5, /* added in 7.10 */ - CURLPROXY_SOCKS4A = 6, /* added in 7.18.0 */ - CURLPROXY_SOCKS5_HOSTNAME = 7 /* Use the SOCKS5 protocol but pass along the - host name rather than the IP address. added - in 7.18.0 */ -} curl_proxytype; /* this enum was added in 7.10 */ - -#define CURLAUTH_NONE 0 /* nothing */ -#define CURLAUTH_BASIC (1<<0) /* Basic (default) */ -#define CURLAUTH_DIGEST (1<<1) /* Digest */ -#define CURLAUTH_GSSNEGOTIATE (1<<2) /* GSS-Negotiate */ -#define CURLAUTH_NTLM (1<<3) /* NTLM */ -#define CURLAUTH_DIGEST_IE (1<<4) /* Digest with IE flavour */ -#define CURLAUTH_ANY (~CURLAUTH_DIGEST_IE) /* all fine types set */ -#define CURLAUTH_ANYSAFE (~(CURLAUTH_BASIC|CURLAUTH_DIGEST_IE)) - -#define CURLSSH_AUTH_ANY ~0 /* all types supported by the server */ -#define CURLSSH_AUTH_NONE 0 /* none allowed, silly but complete */ -#define CURLSSH_AUTH_PUBLICKEY (1<<0) /* public/private key files */ -#define CURLSSH_AUTH_PASSWORD (1<<1) /* password */ -#define CURLSSH_AUTH_HOST (1<<2) /* host key files */ -#define CURLSSH_AUTH_KEYBOARD (1<<3) /* keyboard interactive */ -#define CURLSSH_AUTH_DEFAULT CURLSSH_AUTH_ANY - -#define CURL_ERROR_SIZE 256 - -struct curl_khkey { - const char *key; /* points to a zero-terminated string encoded with base64 - if len is zero, otherwise to the "raw" data */ - size_t len; - enum type { - CURLKHTYPE_UNKNOWN, - CURLKHTYPE_RSA1, - CURLKHTYPE_RSA, - CURLKHTYPE_DSS - } keytype; -}; - -/* this is the set of return values expected from the curl_sshkeycallback - callback */ -enum curl_khstat { - CURLKHSTAT_FINE_ADD_TO_FILE, - CURLKHSTAT_FINE, - CURLKHSTAT_REJECT, /* reject the connection, return an error */ - CURLKHSTAT_DEFER, /* do not accept it, but we can't answer right now so - this causes a CURLE_DEFER error but otherwise the - connection will be left intact etc */ - CURLKHSTAT_LAST /* not for use, only a marker for last-in-list */ -}; - -/* this is the set of status codes pass in to the callback */ -enum curl_khmatch { - CURLKHMATCH_OK, /* match */ - CURLKHMATCH_MISMATCH, /* host found, key mismatch! */ - CURLKHMATCH_MISSING, /* no matching host/key found */ - CURLKHMATCH_LAST /* not for use, only a marker for last-in-list */ -}; - -typedef int - (*curl_sshkeycallback) (CURL *easy, /* easy handle */ - const struct curl_khkey *knownkey, /* known */ - const struct curl_khkey *foundkey, /* found */ - enum curl_khmatch, /* libcurl's view on the keys */ - void *clientp); /* custom pointer passed from app */ - -/* parameter for the CURLOPT_USE_SSL option */ -typedef enum { - CURLUSESSL_NONE, /* do not attempt to use SSL */ - CURLUSESSL_TRY, /* try using SSL, proceed anyway otherwise */ - CURLUSESSL_CONTROL, /* SSL for the control connection or fail */ - CURLUSESSL_ALL, /* SSL for all communication or fail */ - CURLUSESSL_LAST /* not an option, never use */ -} curl_usessl; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! */ - -/* Backwards compatibility with older names */ -/* These are scheduled to disappear by 2009 */ - -#define CURLFTPSSL_NONE CURLUSESSL_NONE -#define CURLFTPSSL_TRY CURLUSESSL_TRY -#define CURLFTPSSL_CONTROL CURLUSESSL_CONTROL -#define CURLFTPSSL_ALL CURLUSESSL_ALL -#define CURLFTPSSL_LAST CURLUSESSL_LAST -#define curl_ftpssl curl_usessl -#endif /*!CURL_NO_OLDIES*/ - -/* parameter for the CURLOPT_FTP_SSL_CCC option */ -typedef enum { - CURLFTPSSL_CCC_NONE, /* do not send CCC */ - CURLFTPSSL_CCC_PASSIVE, /* Let the server initiate the shutdown */ - CURLFTPSSL_CCC_ACTIVE, /* Initiate the shutdown */ - CURLFTPSSL_CCC_LAST /* not an option, never use */ -} curl_ftpccc; - -/* parameter for the CURLOPT_FTPSSLAUTH option */ -typedef enum { - CURLFTPAUTH_DEFAULT, /* let libcurl decide */ - CURLFTPAUTH_SSL, /* use "AUTH SSL" */ - CURLFTPAUTH_TLS, /* use "AUTH TLS" */ - CURLFTPAUTH_LAST /* not an option, never use */ -} curl_ftpauth; - -/* parameter for the CURLOPT_FTP_CREATE_MISSING_DIRS option */ -typedef enum { - CURLFTP_CREATE_DIR_NONE, /* do NOT create missing dirs! */ - CURLFTP_CREATE_DIR, /* (FTP/SFTP) if CWD fails, try MKD and then CWD - again if MKD succeeded, for SFTP this does - similar magic */ - CURLFTP_CREATE_DIR_RETRY, /* (FTP only) if CWD fails, try MKD and then CWD - again even if MKD failed! */ - CURLFTP_CREATE_DIR_LAST /* not an option, never use */ -} curl_ftpcreatedir; - -/* parameter for the CURLOPT_FTP_FILEMETHOD option */ -typedef enum { - CURLFTPMETHOD_DEFAULT, /* let libcurl pick */ - CURLFTPMETHOD_MULTICWD, /* single CWD operation for each path part */ - CURLFTPMETHOD_NOCWD, /* no CWD at all */ - CURLFTPMETHOD_SINGLECWD, /* one CWD to full dir, then work on file */ - CURLFTPMETHOD_LAST /* not an option, never use */ -} curl_ftpmethod; - -/* CURLPROTO_ defines are for the CURLOPT_*PROTOCOLS options */ -#define CURLPROTO_HTTP (1<<0) -#define CURLPROTO_HTTPS (1<<1) -#define CURLPROTO_FTP (1<<2) -#define CURLPROTO_FTPS (1<<3) -#define CURLPROTO_SCP (1<<4) -#define CURLPROTO_SFTP (1<<5) -#define CURLPROTO_TELNET (1<<6) -#define CURLPROTO_LDAP (1<<7) -#define CURLPROTO_LDAPS (1<<8) -#define CURLPROTO_DICT (1<<9) -#define CURLPROTO_FILE (1<<10) -#define CURLPROTO_TFTP (1<<11) -#define CURLPROTO_IMAP (1<<12) -#define CURLPROTO_IMAPS (1<<13) -#define CURLPROTO_POP3 (1<<14) -#define CURLPROTO_POP3S (1<<15) -#define CURLPROTO_SMTP (1<<16) -#define CURLPROTO_SMTPS (1<<17) -#define CURLPROTO_RTSP (1<<18) -#define CURLPROTO_ALL (~0) /* enable everything */ - -/* long may be 32 or 64 bits, but we should never depend on anything else - but 32 */ -#define CURLOPTTYPE_LONG 0 -#define CURLOPTTYPE_OBJECTPOINT 10000 -#define CURLOPTTYPE_FUNCTIONPOINT 20000 -#define CURLOPTTYPE_OFF_T 30000 - -/* name is uppercase CURLOPT_<name>, - type is one of the defined CURLOPTTYPE_<type> - number is unique identifier */ -#ifdef CINIT -#undef CINIT -#endif - -#ifdef CURL_ISOCPP -#define CINIT(name,type,number) CURLOPT_ ## name = CURLOPTTYPE_ ## type + number -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ -#define LONG CURLOPTTYPE_LONG -#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT -#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT -#define OFF_T CURLOPTTYPE_OFF_T -#define CINIT(name,type,number) CURLOPT_/**/name = type + number -#endif - -/* - * This macro-mania below setups the CURLOPT_[what] enum, to be used with - * curl_easy_setopt(). The first argument in the CINIT() macro is the [what] - * word. - */ - -typedef enum { - /* This is the FILE * or void * the regular output should be written to. */ - CINIT(FILE, OBJECTPOINT, 1), - - /* The full URL to get/put */ - CINIT(URL, OBJECTPOINT, 2), - - /* Port number to connect to, if other than default. */ - CINIT(PORT, LONG, 3), - - /* Name of proxy to use. */ - CINIT(PROXY, OBJECTPOINT, 4), - - /* "name:password" to use when fetching. */ - CINIT(USERPWD, OBJECTPOINT, 5), - - /* "name:password" to use with proxy. */ - CINIT(PROXYUSERPWD, OBJECTPOINT, 6), - - /* Range to get, specified as an ASCII string. */ - CINIT(RANGE, OBJECTPOINT, 7), - - /* not used */ - - /* Specified file stream to upload from (use as input): */ - CINIT(INFILE, OBJECTPOINT, 9), - - /* Buffer to receive error messages in, must be at least CURL_ERROR_SIZE - * bytes big. If this is not used, error messages go to stderr instead: */ - CINIT(ERRORBUFFER, OBJECTPOINT, 10), - - /* Function that will be called to store the output (instead of fwrite). The - * parameters will use fwrite() syntax, make sure to follow them. */ - CINIT(WRITEFUNCTION, FUNCTIONPOINT, 11), - - /* Function that will be called to read the input (instead of fread). The - * parameters will use fread() syntax, make sure to follow them. */ - CINIT(READFUNCTION, FUNCTIONPOINT, 12), - - /* Time-out the read operation after this amount of seconds */ - CINIT(TIMEOUT, LONG, 13), - - /* If the CURLOPT_INFILE is used, this can be used to inform libcurl about - * how large the file being sent really is. That allows better error - * checking and better verifies that the upload was successful. -1 means - * unknown size. - * - * For large file support, there is also a _LARGE version of the key - * which takes an off_t type, allowing platforms with larger off_t - * sizes to handle larger files. See below for INFILESIZE_LARGE. - */ - CINIT(INFILESIZE, LONG, 14), - - /* POST static input fields. */ - CINIT(POSTFIELDS, OBJECTPOINT, 15), - - /* Set the referrer page (needed by some CGIs) */ - CINIT(REFERER, OBJECTPOINT, 16), - - /* Set the FTP PORT string (interface name, named or numerical IP address) - Use i.e '-' to use default address. */ - CINIT(FTPPORT, OBJECTPOINT, 17), - - /* Set the User-Agent string (examined by some CGIs) */ - CINIT(USERAGENT, OBJECTPOINT, 18), - - /* If the download receives less than "low speed limit" bytes/second - * during "low speed time" seconds, the operations is aborted. - * You could i.e if you have a pretty high speed connection, abort if - * it is less than 2000 bytes/sec during 20 seconds. - */ - - /* Set the "low speed limit" */ - CINIT(LOW_SPEED_LIMIT, LONG, 19), - - /* Set the "low speed time" */ - CINIT(LOW_SPEED_TIME, LONG, 20), - - /* Set the continuation offset. - * - * Note there is also a _LARGE version of this key which uses - * off_t types, allowing for large file offsets on platforms which - * use larger-than-32-bit off_t's. Look below for RESUME_FROM_LARGE. - */ - CINIT(RESUME_FROM, LONG, 21), - - /* Set cookie in request: */ - CINIT(COOKIE, OBJECTPOINT, 22), - - /* This points to a linked list of headers, struct curl_slist kind */ - CINIT(HTTPHEADER, OBJECTPOINT, 23), - - /* This points to a linked list of post entries, struct curl_httppost */ - CINIT(HTTPPOST, OBJECTPOINT, 24), - - /* name of the file keeping your private SSL-certificate */ - CINIT(SSLCERT, OBJECTPOINT, 25), - - /* password for the SSL or SSH private key */ - CINIT(KEYPASSWD, OBJECTPOINT, 26), - - /* send TYPE parameter? */ - CINIT(CRLF, LONG, 27), - - /* send linked-list of QUOTE commands */ - CINIT(QUOTE, OBJECTPOINT, 28), - - /* send FILE * or void * to store headers to, if you use a callback it - is simply passed to the callback unmodified */ - CINIT(WRITEHEADER, OBJECTPOINT, 29), - - /* point to a file to read the initial cookies from, also enables - "cookie awareness" */ - CINIT(COOKIEFILE, OBJECTPOINT, 31), - - /* What version to specifically try to use. - See CURL_SSLVERSION defines below. */ - CINIT(SSLVERSION, LONG, 32), - - /* What kind of HTTP time condition to use, see defines */ - CINIT(TIMECONDITION, LONG, 33), - - /* Time to use with the above condition. Specified in number of seconds - since 1 Jan 1970 */ - CINIT(TIMEVALUE, LONG, 34), - - /* 35 = OBSOLETE */ - - /* Custom request, for customizing the get command like - HTTP: DELETE, TRACE and others - FTP: to use a different list command - */ - CINIT(CUSTOMREQUEST, OBJECTPOINT, 36), - - /* HTTP request, for odd commands like DELETE, TRACE and others */ - CINIT(STDERR, OBJECTPOINT, 37), - - /* 38 is not used */ - - /* send linked-list of post-transfer QUOTE commands */ - CINIT(POSTQUOTE, OBJECTPOINT, 39), - - /* Pass a pointer to string of the output using full variable-replacement - as described elsewhere. */ - CINIT(WRITEINFO, OBJECTPOINT, 40), - - CINIT(VERBOSE, LONG, 41), /* talk a lot */ - CINIT(HEADER, LONG, 42), /* throw the header out too */ - CINIT(NOPROGRESS, LONG, 43), /* shut off the progress meter */ - CINIT(NOBODY, LONG, 44), /* use HEAD to get http document */ - CINIT(FAILONERROR, LONG, 45), /* no output on http error codes >= 300 */ - CINIT(UPLOAD, LONG, 46), /* this is an upload */ - CINIT(POST, LONG, 47), /* HTTP POST method */ - CINIT(DIRLISTONLY, LONG, 48), /* return bare names when listing directories */ - - CINIT(APPEND, LONG, 50), /* Append instead of overwrite on upload! */ - - /* Specify whether to read the user+password from the .netrc or the URL. - * This must be one of the CURL_NETRC_* enums below. */ - CINIT(NETRC, LONG, 51), - - CINIT(FOLLOWLOCATION, LONG, 52), /* use Location: Luke! */ - - CINIT(TRANSFERTEXT, LONG, 53), /* transfer data in text/ASCII format */ - CINIT(PUT, LONG, 54), /* HTTP PUT */ - - /* 55 = OBSOLETE */ - - /* Function that will be called instead of the internal progress display - * function. This function should be defined as the curl_progress_callback - * prototype defines. */ - CINIT(PROGRESSFUNCTION, FUNCTIONPOINT, 56), - - /* Data passed to the progress callback */ - CINIT(PROGRESSDATA, OBJECTPOINT, 57), - - /* We want the referrer field set automatically when following locations */ - CINIT(AUTOREFERER, LONG, 58), - - /* Port of the proxy, can be set in the proxy string as well with: - "[host]:[port]" */ - CINIT(PROXYPORT, LONG, 59), - - /* size of the POST input data, if strlen() is not good to use */ - CINIT(POSTFIELDSIZE, LONG, 60), - - /* tunnel non-http operations through a HTTP proxy */ - CINIT(HTTPPROXYTUNNEL, LONG, 61), - - /* Set the interface string to use as outgoing network interface */ - CINIT(INTERFACE, OBJECTPOINT, 62), - - /* Set the krb4/5 security level, this also enables krb4/5 awareness. This - * is a string, 'clear', 'safe', 'confidential' or 'private'. If the string - * is set but doesn't match one of these, 'private' will be used. */ - CINIT(KRBLEVEL, OBJECTPOINT, 63), - - /* Set if we should verify the peer in ssl handshake, set 1 to verify. */ - CINIT(SSL_VERIFYPEER, LONG, 64), - - /* The CApath or CAfile used to validate the peer certificate - this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAINFO, OBJECTPOINT, 65), - - /* 66 = OBSOLETE */ - /* 67 = OBSOLETE */ - - /* Maximum number of http redirects to follow */ - CINIT(MAXREDIRS, LONG, 68), - - /* Pass a long set to 1 to get the date of the requested document (if - possible)! Pass a zero to shut it off. */ - CINIT(FILETIME, LONG, 69), - - /* This points to a linked list of telnet options */ - CINIT(TELNETOPTIONS, OBJECTPOINT, 70), - - /* Max amount of cached alive connections */ - CINIT(MAXCONNECTS, LONG, 71), - - /* What policy to use when closing connections when the cache is filled - up */ - CINIT(CLOSEPOLICY, LONG, 72), - - /* 73 = OBSOLETE */ - - /* Set to explicitly use a new connection for the upcoming transfer. - Do not use this unless you're absolutely sure of this, as it makes the - operation slower and is less friendly for the network. */ - CINIT(FRESH_CONNECT, LONG, 74), - - /* Set to explicitly forbid the upcoming transfer's connection to be re-used - when done. Do not use this unless you're absolutely sure of this, as it - makes the operation slower and is less friendly for the network. */ - CINIT(FORBID_REUSE, LONG, 75), - - /* Set to a file name that contains random data for libcurl to use to - seed the random engine when doing SSL connects. */ - CINIT(RANDOM_FILE, OBJECTPOINT, 76), - - /* Set to the Entropy Gathering Daemon socket pathname */ - CINIT(EGDSOCKET, OBJECTPOINT, 77), - - /* Time-out connect operations after this amount of seconds, if connects - are OK within this time, then fine... This only aborts the connect - phase. [Only works on unix-style/SIGALRM operating systems] */ - CINIT(CONNECTTIMEOUT, LONG, 78), - - /* Function that will be called to store headers (instead of fwrite). The - * parameters will use fwrite() syntax, make sure to follow them. */ - CINIT(HEADERFUNCTION, FUNCTIONPOINT, 79), - - /* Set this to force the HTTP request to get back to GET. Only really usable - if POST, PUT or a custom request have been used first. - */ - CINIT(HTTPGET, LONG, 80), - - /* Set if we should verify the Common name from the peer certificate in ssl - * handshake, set 1 to check existence, 2 to ensure that it matches the - * provided hostname. */ - CINIT(SSL_VERIFYHOST, LONG, 81), - - /* Specify which file name to write all known cookies in after completed - operation. Set file name to "-" (dash) to make it go to stdout. */ - CINIT(COOKIEJAR, OBJECTPOINT, 82), - - /* Specify which SSL ciphers to use */ - CINIT(SSL_CIPHER_LIST, OBJECTPOINT, 83), - - /* Specify which HTTP version to use! This must be set to one of the - CURL_HTTP_VERSION* enums set below. */ - CINIT(HTTP_VERSION, LONG, 84), - - /* Specifically switch on or off the FTP engine's use of the EPSV command. By - default, that one will always be attempted before the more traditional - PASV command. */ - CINIT(FTP_USE_EPSV, LONG, 85), - - /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") */ - CINIT(SSLCERTTYPE, OBJECTPOINT, 86), - - /* name of the file keeping your private SSL-key */ - CINIT(SSLKEY, OBJECTPOINT, 87), - - /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") */ - CINIT(SSLKEYTYPE, OBJECTPOINT, 88), - - /* crypto engine for the SSL-sub system */ - CINIT(SSLENGINE, OBJECTPOINT, 89), - - /* set the crypto engine for the SSL-sub system as default - the param has no meaning... - */ - CINIT(SSLENGINE_DEFAULT, LONG, 90), - - /* Non-zero value means to use the global dns cache */ - CINIT(DNS_USE_GLOBAL_CACHE, LONG, 91), /* To become OBSOLETE soon */ - - /* DNS cache timeout */ - CINIT(DNS_CACHE_TIMEOUT, LONG, 92), - - /* send linked-list of pre-transfer QUOTE commands */ - CINIT(PREQUOTE, OBJECTPOINT, 93), - - /* set the debug function */ - CINIT(DEBUGFUNCTION, FUNCTIONPOINT, 94), - - /* set the data for the debug function */ - CINIT(DEBUGDATA, OBJECTPOINT, 95), - - /* mark this as start of a cookie session */ - CINIT(COOKIESESSION, LONG, 96), - - /* The CApath directory used to validate the peer certificate - this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAPATH, OBJECTPOINT, 97), - - /* Instruct libcurl to use a smaller receive buffer */ - CINIT(BUFFERSIZE, LONG, 98), - - /* Instruct libcurl to not use any signal/alarm handlers, even when using - timeouts. This option is useful for multi-threaded applications. - See libcurl-the-guide for more background information. */ - CINIT(NOSIGNAL, LONG, 99), - - /* Provide a CURLShare for mutexing non-ts data */ - CINIT(SHARE, OBJECTPOINT, 100), - - /* indicates type of proxy. accepted values are CURLPROXY_HTTP (default), - CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and CURLPROXY_SOCKS5. */ - CINIT(PROXYTYPE, LONG, 101), - - /* Set the Accept-Encoding string. Use this to tell a server you would like - the response to be compressed. */ - CINIT(ENCODING, OBJECTPOINT, 102), - - /* Set pointer to private data */ - CINIT(PRIVATE, OBJECTPOINT, 103), - - /* Set aliases for HTTP 200 in the HTTP Response header */ - CINIT(HTTP200ALIASES, OBJECTPOINT, 104), - - /* Continue to send authentication (user+password) when following locations, - even when hostname changed. This can potentially send off the name - and password to whatever host the server decides. */ - CINIT(UNRESTRICTED_AUTH, LONG, 105), - - /* Specifically switch on or off the FTP engine's use of the EPRT command ( it - also disables the LPRT attempt). By default, those ones will always be - attempted before the good old traditional PORT command. */ - CINIT(FTP_USE_EPRT, LONG, 106), - - /* Set this to a bitmask value to enable the particular authentications - methods you like. Use this in combination with CURLOPT_USERPWD. - Note that setting multiple bits may cause extra network round-trips. */ - CINIT(HTTPAUTH, LONG, 107), - - /* Set the ssl context callback function, currently only for OpenSSL ssl_ctx - in second argument. The function must be matching the - curl_ssl_ctx_callback proto. */ - CINIT(SSL_CTX_FUNCTION, FUNCTIONPOINT, 108), - - /* Set the userdata for the ssl context callback function's third - argument */ - CINIT(SSL_CTX_DATA, OBJECTPOINT, 109), - - /* FTP Option that causes missing dirs to be created on the remote server. - In 7.19.4 we introduced the convenience enums for this option using the - CURLFTP_CREATE_DIR prefix. - */ - CINIT(FTP_CREATE_MISSING_DIRS, LONG, 110), - - /* Set this to a bitmask value to enable the particular authentications - methods you like. Use this in combination with CURLOPT_PROXYUSERPWD. - Note that setting multiple bits may cause extra network round-trips. */ - CINIT(PROXYAUTH, LONG, 111), - - /* FTP option that changes the timeout, in seconds, associated with - getting a response. This is different from transfer timeout time and - essentially places a demand on the FTP server to acknowledge commands - in a timely manner. */ - CINIT(FTP_RESPONSE_TIMEOUT, LONG, 112), -#define CURLOPT_SERVER_RESPONSE_TIMEOUT CURLOPT_FTP_RESPONSE_TIMEOUT - - /* Set this option to one of the CURL_IPRESOLVE_* defines (see below) to - tell libcurl to resolve names to those IP versions only. This only has - affect on systems with support for more than one, i.e IPv4 _and_ IPv6. */ - CINIT(IPRESOLVE, LONG, 113), - - /* Set this option to limit the size of a file that will be downloaded from - an HTTP or FTP server. - - Note there is also _LARGE version which adds large file support for - platforms which have larger off_t sizes. See MAXFILESIZE_LARGE below. */ - CINIT(MAXFILESIZE, LONG, 114), - - /* See the comment for INFILESIZE above, but in short, specifies - * the size of the file being uploaded. -1 means unknown. - */ - CINIT(INFILESIZE_LARGE, OFF_T, 115), - - /* Sets the continuation offset. There is also a LONG version of this; - * look above for RESUME_FROM. - */ - CINIT(RESUME_FROM_LARGE, OFF_T, 116), - - /* Sets the maximum size of data that will be downloaded from - * an HTTP or FTP server. See MAXFILESIZE above for the LONG version. - */ - CINIT(MAXFILESIZE_LARGE, OFF_T, 117), - - /* Set this option to the file name of your .netrc file you want libcurl - to parse (using the CURLOPT_NETRC option). If not set, libcurl will do - a poor attempt to find the user's home directory and check for a .netrc - file in there. */ - CINIT(NETRC_FILE, OBJECTPOINT, 118), - - /* Enable SSL/TLS for FTP, pick one of: - CURLFTPSSL_TRY - try using SSL, proceed anyway otherwise - CURLFTPSSL_CONTROL - SSL for the control connection or fail - CURLFTPSSL_ALL - SSL for all communication or fail - */ - CINIT(USE_SSL, LONG, 119), - - /* The _LARGE version of the standard POSTFIELDSIZE option */ - CINIT(POSTFIELDSIZE_LARGE, OFF_T, 120), - - /* Enable/disable the TCP Nagle algorithm */ - CINIT(TCP_NODELAY, LONG, 121), - - /* 122 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 123 OBSOLETE. Gone in 7.16.0 */ - /* 124 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 125 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 126 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 127 OBSOLETE. Gone in 7.16.0 */ - /* 128 OBSOLETE. Gone in 7.16.0 */ - - /* When FTP over SSL/TLS is selected (with CURLOPT_USE_SSL), this option - can be used to change libcurl's default action which is to first try - "AUTH SSL" and then "AUTH TLS" in this order, and proceed when a OK - response has been received. - - Available parameters are: - CURLFTPAUTH_DEFAULT - let libcurl decide - CURLFTPAUTH_SSL - try "AUTH SSL" first, then TLS - CURLFTPAUTH_TLS - try "AUTH TLS" first, then SSL - */ - CINIT(FTPSSLAUTH, LONG, 129), - - CINIT(IOCTLFUNCTION, FUNCTIONPOINT, 130), - CINIT(IOCTLDATA, OBJECTPOINT, 131), - - /* 132 OBSOLETE. Gone in 7.16.0 */ - /* 133 OBSOLETE. Gone in 7.16.0 */ - - /* zero terminated string for pass on to the FTP server when asked for - "account" info */ - CINIT(FTP_ACCOUNT, OBJECTPOINT, 134), - - /* feed cookies into cookie engine */ - CINIT(COOKIELIST, OBJECTPOINT, 135), - - /* ignore Content-Length */ - CINIT(IGNORE_CONTENT_LENGTH, LONG, 136), - - /* Set to non-zero to skip the IP address received in a 227 PASV FTP server - response. Typically used for FTP-SSL purposes but is not restricted to - that. libcurl will then instead use the same IP address it used for the - control connection. */ - CINIT(FTP_SKIP_PASV_IP, LONG, 137), - - /* Select "file method" to use when doing FTP, see the curl_ftpmethod - above. */ - CINIT(FTP_FILEMETHOD, LONG, 138), - - /* Local port number to bind the socket to */ - CINIT(LOCALPORT, LONG, 139), - - /* Number of ports to try, including the first one set with LOCALPORT. - Thus, setting it to 1 will make no additional attempts but the first. - */ - CINIT(LOCALPORTRANGE, LONG, 140), - - /* no transfer, set up connection and let application use the socket by - extracting it with CURLINFO_LASTSOCKET */ - CINIT(CONNECT_ONLY, LONG, 141), - - /* Function that will be called to convert from the - network encoding (instead of using the iconv calls in libcurl) */ - CINIT(CONV_FROM_NETWORK_FUNCTION, FUNCTIONPOINT, 142), - - /* Function that will be called to convert to the - network encoding (instead of using the iconv calls in libcurl) */ - CINIT(CONV_TO_NETWORK_FUNCTION, FUNCTIONPOINT, 143), - - /* Function that will be called to convert from UTF8 - (instead of using the iconv calls in libcurl) - Note that this is used only for SSL certificate processing */ - CINIT(CONV_FROM_UTF8_FUNCTION, FUNCTIONPOINT, 144), - - /* if the connection proceeds too quickly then need to slow it down */ - /* limit-rate: maximum number of bytes per second to send or receive */ - CINIT(MAX_SEND_SPEED_LARGE, OFF_T, 145), - CINIT(MAX_RECV_SPEED_LARGE, OFF_T, 146), - - /* Pointer to command string to send if USER/PASS fails. */ - CINIT(FTP_ALTERNATIVE_TO_USER, OBJECTPOINT, 147), - - /* callback function for setting socket options */ - CINIT(SOCKOPTFUNCTION, FUNCTIONPOINT, 148), - CINIT(SOCKOPTDATA, OBJECTPOINT, 149), - - /* set to 0 to disable session ID re-use for this transfer, default is - enabled (== 1) */ - CINIT(SSL_SESSIONID_CACHE, LONG, 150), - - /* allowed SSH authentication methods */ - CINIT(SSH_AUTH_TYPES, LONG, 151), - - /* Used by scp/sftp to do public/private key authentication */ - CINIT(SSH_PUBLIC_KEYFILE, OBJECTPOINT, 152), - CINIT(SSH_PRIVATE_KEYFILE, OBJECTPOINT, 153), - - /* Send CCC (Clear Command Channel) after authentication */ - CINIT(FTP_SSL_CCC, LONG, 154), - - /* Same as TIMEOUT and CONNECTTIMEOUT, but with ms resolution */ - CINIT(TIMEOUT_MS, LONG, 155), - CINIT(CONNECTTIMEOUT_MS, LONG, 156), - - /* set to zero to disable the libcurl's decoding and thus pass the raw body - data to the application even when it is encoded/compressed */ - CINIT(HTTP_TRANSFER_DECODING, LONG, 157), - CINIT(HTTP_CONTENT_DECODING, LONG, 158), - - /* Permission used when creating new files and directories on the remote - server for protocols that support it, SFTP/SCP/FILE */ - CINIT(NEW_FILE_PERMS, LONG, 159), - CINIT(NEW_DIRECTORY_PERMS, LONG, 160), - - /* Set the behaviour of POST when redirecting. Values must be set to one - of CURL_REDIR* defines below. This used to be called CURLOPT_POST301 */ - CINIT(POSTREDIR, LONG, 161), - - /* used by scp/sftp to verify the host's public key */ - CINIT(SSH_HOST_PUBLIC_KEY_MD5, OBJECTPOINT, 162), - - /* Callback function for opening socket (instead of socket(2)). Optionally, - callback is able change the address or refuse to connect returning - CURL_SOCKET_BAD. The callback should have type - curl_opensocket_callback */ - CINIT(OPENSOCKETFUNCTION, FUNCTIONPOINT, 163), - CINIT(OPENSOCKETDATA, OBJECTPOINT, 164), - - /* POST volatile input fields. */ - CINIT(COPYPOSTFIELDS, OBJECTPOINT, 165), - - /* set transfer mode (;type=<a|i>) when doing FTP via an HTTP proxy */ - CINIT(PROXY_TRANSFER_MODE, LONG, 166), - - /* Callback function for seeking in the input stream */ - CINIT(SEEKFUNCTION, FUNCTIONPOINT, 167), - CINIT(SEEKDATA, OBJECTPOINT, 168), - - /* CRL file */ - CINIT(CRLFILE, OBJECTPOINT, 169), - - /* Issuer certificate */ - CINIT(ISSUERCERT, OBJECTPOINT, 170), - - /* (IPv6) Address scope */ - CINIT(ADDRESS_SCOPE, LONG, 171), - - /* Collect certificate chain info and allow it to get retrievable with - CURLINFO_CERTINFO after the transfer is complete. (Unfortunately) only - working with OpenSSL-powered builds. */ - CINIT(CERTINFO, LONG, 172), - - /* "name" and "pwd" to use when fetching. */ - CINIT(USERNAME, OBJECTPOINT, 173), - CINIT(PASSWORD, OBJECTPOINT, 174), - - /* "name" and "pwd" to use with Proxy when fetching. */ - CINIT(PROXYUSERNAME, OBJECTPOINT, 175), - CINIT(PROXYPASSWORD, OBJECTPOINT, 176), - - /* Comma separated list of hostnames defining no-proxy zones. These should - match both hostnames directly, and hostnames within a domain. For - example, local.com will match local.com and www.local.com, but NOT - notlocal.com or www.notlocal.com. For compatibility with other - implementations of this, .local.com will be considered to be the same as - local.com. A single * is the only valid wildcard, and effectively - disables the use of proxy. */ - CINIT(NOPROXY, OBJECTPOINT, 177), - - /* block size for TFTP transfers */ - CINIT(TFTP_BLKSIZE, LONG, 178), - - /* Socks Service */ - CINIT(SOCKS5_GSSAPI_SERVICE, OBJECTPOINT, 179), - - /* Socks Service */ - CINIT(SOCKS5_GSSAPI_NEC, LONG, 180), - - /* set the bitmask for the protocols that are allowed to be used for the - transfer, which thus helps the app which takes URLs from users or other - external inputs and want to restrict what protocol(s) to deal - with. Defaults to CURLPROTO_ALL. */ - CINIT(PROTOCOLS, LONG, 181), - - /* set the bitmask for the protocols that libcurl is allowed to follow to, - as a subset of the CURLOPT_PROTOCOLS ones. That means the protocol needs - to be set in both bitmasks to be allowed to get redirected to. Defaults - to all protocols except FILE and SCP. */ - CINIT(REDIR_PROTOCOLS, LONG, 182), - - /* set the SSH knownhost file name to use */ - CINIT(SSH_KNOWNHOSTS, OBJECTPOINT, 183), - - /* set the SSH host key callback, must point to a curl_sshkeycallback - function */ - CINIT(SSH_KEYFUNCTION, FUNCTIONPOINT, 184), - - /* set the SSH host key callback custom pointer */ - CINIT(SSH_KEYDATA, OBJECTPOINT, 185), - - /* set the SMTP mail originator */ - CINIT(MAIL_FROM, OBJECTPOINT, 186), - - /* set the SMTP mail receiver(s) */ - CINIT(MAIL_RCPT, OBJECTPOINT, 187), - - /* FTP: send PRET before PASV */ - CINIT(FTP_USE_PRET, LONG, 188), - - /* RTSP request method (OPTIONS, SETUP, PLAY, etc...) */ - CINIT(RTSP_REQUEST, LONG, 189), - - /* The RTSP session identifier */ - CINIT(RTSP_SESSION_ID, OBJECTPOINT, 190), - - /* The RTSP stream URI */ - CINIT(RTSP_STREAM_URI, OBJECTPOINT, 191), - - /* The Transport: header to use in RTSP requests */ - CINIT(RTSP_TRANSPORT, OBJECTPOINT, 192), - - /* Manually initialize the client RTSP CSeq for this handle */ - CINIT(RTSP_CLIENT_CSEQ, LONG, 193), - - /* Manually initialize the server RTSP CSeq for this handle */ - CINIT(RTSP_SERVER_CSEQ, LONG, 194), - - /* The stream to pass to INTERLEAVEFUNCTION. */ - CINIT(INTERLEAVEDATA, OBJECTPOINT, 195), - - /* Let the application define a custom write method for RTP data */ - CINIT(INTERLEAVEFUNCTION, FUNCTIONPOINT, 196), - - CURLOPT_LASTENTRY /* the last unused */ -} CURLoption; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! */ - -/* Backwards compatibility with older names */ -/* These are scheduled to disappear by 2011 */ - -/* This was added in version 7.19.1 */ -#define CURLOPT_POST301 CURLOPT_POSTREDIR - -/* These are scheduled to disappear by 2009 */ - -/* The following were added in 7.17.0 */ -#define CURLOPT_SSLKEYPASSWD CURLOPT_KEYPASSWD -#define CURLOPT_FTPAPPEND CURLOPT_APPEND -#define CURLOPT_FTPLISTONLY CURLOPT_DIRLISTONLY -#define CURLOPT_FTP_SSL CURLOPT_USE_SSL - -/* The following were added earlier */ - -#define CURLOPT_SSLCERTPASSWD CURLOPT_KEYPASSWD -#define CURLOPT_KRB4LEVEL CURLOPT_KRBLEVEL - -#else -/* This is set if CURL_NO_OLDIES is defined at compile-time */ -#undef CURLOPT_DNS_USE_GLOBAL_CACHE /* soon obsolete */ -#endif - - - /* Below here follows defines for the CURLOPT_IPRESOLVE option. If a host - name resolves addresses using more than one IP protocol version, this - option might be handy to force libcurl to use a specific IP version. */ -#define CURL_IPRESOLVE_WHATEVER 0 /* default, resolves addresses to all IP - versions that your system allows */ -#define CURL_IPRESOLVE_V4 1 /* resolve to ipv4 addresses */ -#define CURL_IPRESOLVE_V6 2 /* resolve to ipv6 addresses */ - - /* three convenient "aliases" that follow the name scheme better */ -#define CURLOPT_WRITEDATA CURLOPT_FILE -#define CURLOPT_READDATA CURLOPT_INFILE -#define CURLOPT_HEADERDATA CURLOPT_WRITEHEADER -#define CURLOPT_RTSPHEADER CURLOPT_HTTPHEADER - - /* These enums are for use with the CURLOPT_HTTP_VERSION option. */ -enum { - CURL_HTTP_VERSION_NONE, /* setting this means we don't care, and that we'd - like the library to choose the best possible - for us! */ - CURL_HTTP_VERSION_1_0, /* please use HTTP 1.0 in the request */ - CURL_HTTP_VERSION_1_1, /* please use HTTP 1.1 in the request */ - - CURL_HTTP_VERSION_LAST /* *ILLEGAL* http version */ -}; - -/* - * Public API enums for RTSP requests - */ -enum { - CURL_RTSPREQ_NONE, /* first in list */ - CURL_RTSPREQ_OPTIONS, - CURL_RTSPREQ_DESCRIBE, - CURL_RTSPREQ_ANNOUNCE, - CURL_RTSPREQ_SETUP, - CURL_RTSPREQ_PLAY, - CURL_RTSPREQ_PAUSE, - CURL_RTSPREQ_TEARDOWN, - CURL_RTSPREQ_GET_PARAMETER, - CURL_RTSPREQ_SET_PARAMETER, - CURL_RTSPREQ_RECORD, - CURL_RTSPREQ_RECEIVE, - CURL_RTSPREQ_LAST /* last in list */ -}; - - /* These enums are for use with the CURLOPT_NETRC option. */ -enum CURL_NETRC_OPTION { - CURL_NETRC_IGNORED, /* The .netrc will never be read. - * This is the default. */ - CURL_NETRC_OPTIONAL, /* A user:password in the URL will be preferred - * to one in the .netrc. */ - CURL_NETRC_REQUIRED, /* A user:password in the URL will be ignored. - * Unless one is set programmatically, the .netrc - * will be queried. */ - CURL_NETRC_LAST -}; - -enum { - CURL_SSLVERSION_DEFAULT, - CURL_SSLVERSION_TLSv1, - CURL_SSLVERSION_SSLv2, - CURL_SSLVERSION_SSLv3, - - CURL_SSLVERSION_LAST /* never use, keep last */ -}; - -/* symbols to use with CURLOPT_POSTREDIR. - CURL_REDIR_POST_301 and CURL_REDIR_POST_302 can be bitwise ORed so that - CURL_REDIR_POST_301 | CURL_REDIR_POST_302 == CURL_REDIR_POST_ALL */ - -#define CURL_REDIR_GET_ALL 0 -#define CURL_REDIR_POST_301 1 -#define CURL_REDIR_POST_302 2 -#define CURL_REDIR_POST_ALL (CURL_REDIR_POST_301|CURL_REDIR_POST_302) - -typedef enum { - CURL_TIMECOND_NONE, - - CURL_TIMECOND_IFMODSINCE, - CURL_TIMECOND_IFUNMODSINCE, - CURL_TIMECOND_LASTMOD, - - CURL_TIMECOND_LAST -} curl_TimeCond; - - -/* curl_strequal() and curl_strnequal() are subject for removal in a future - libcurl, see lib/README.curlx for details */ -CURL_EXTERN int (curl_strequal)(const char *s1, const char *s2); -CURL_EXTERN int (curl_strnequal)(const char *s1, const char *s2, size_t n); - -/* name is uppercase CURLFORM_<name> */ -#ifdef CFINIT -#undef CFINIT -#endif - -#ifdef CURL_ISOCPP -#define CFINIT(name) CURLFORM_ ## name -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ -#define CFINIT(name) CURLFORM_/**/name -#endif - -typedef enum { - CFINIT(NOTHING), /********* the first one is unused ************/ - - /* */ - CFINIT(COPYNAME), - CFINIT(PTRNAME), - CFINIT(NAMELENGTH), - CFINIT(COPYCONTENTS), - CFINIT(PTRCONTENTS), - CFINIT(CONTENTSLENGTH), - CFINIT(FILECONTENT), - CFINIT(ARRAY), - CFINIT(OBSOLETE), - CFINIT(FILE), - - CFINIT(BUFFER), - CFINIT(BUFFERPTR), - CFINIT(BUFFERLENGTH), - - CFINIT(CONTENTTYPE), - CFINIT(CONTENTHEADER), - CFINIT(FILENAME), - CFINIT(END), - CFINIT(OBSOLETE2), - - CFINIT(STREAM), - - CURLFORM_LASTENTRY /* the last unused */ -} CURLformoption; - -#undef CFINIT /* done */ - -/* structure to be used as parameter for CURLFORM_ARRAY */ -struct curl_forms { - CURLformoption option; - const char *value; -}; - -/* use this for multipart formpost building */ -/* Returns code for curl_formadd() - * - * Returns: - * CURL_FORMADD_OK on success - * CURL_FORMADD_MEMORY if the FormInfo allocation fails - * CURL_FORMADD_OPTION_TWICE if one option is given twice for one Form - * CURL_FORMADD_NULL if a null pointer was given for a char - * CURL_FORMADD_MEMORY if the allocation of a FormInfo struct failed - * CURL_FORMADD_UNKNOWN_OPTION if an unknown option was used - * CURL_FORMADD_INCOMPLETE if the some FormInfo is not complete (or error) - * CURL_FORMADD_MEMORY if a curl_httppost struct cannot be allocated - * CURL_FORMADD_MEMORY if some allocation for string copying failed. - * CURL_FORMADD_ILLEGAL_ARRAY if an illegal option is used in an array - * - ***************************************************************************/ -typedef enum { - CURL_FORMADD_OK, /* first, no error */ - - CURL_FORMADD_MEMORY, - CURL_FORMADD_OPTION_TWICE, - CURL_FORMADD_NULL, - CURL_FORMADD_UNKNOWN_OPTION, - CURL_FORMADD_INCOMPLETE, - CURL_FORMADD_ILLEGAL_ARRAY, - CURL_FORMADD_DISABLED, /* libcurl was built with this disabled */ - - CURL_FORMADD_LAST /* last */ -} CURLFORMcode; - -/* - * NAME curl_formadd() - * - * DESCRIPTION - * - * Pretty advanced function for building multi-part formposts. Each invoke - * adds one part that together construct a full post. Then use - * CURLOPT_HTTPPOST to send it off to libcurl. - */ -CURL_EXTERN CURLFORMcode curl_formadd(struct curl_httppost **httppost, - struct curl_httppost **last_post, - ...); - -/* - * callback function for curl_formget() - * The void *arg pointer will be the one passed as second argument to - * curl_formget(). - * The character buffer passed to it must not be freed. - * Should return the buffer length passed to it as the argument "len" on - * success. - */ -typedef size_t (*curl_formget_callback)(void *arg, const char *buf, size_t len); - -/* - * NAME curl_formget() - * - * DESCRIPTION - * - * Serialize a curl_httppost struct built with curl_formadd(). - * Accepts a void pointer as second argument which will be passed to - * the curl_formget_callback function. - * Returns 0 on success. - */ -CURL_EXTERN int curl_formget(struct curl_httppost *form, void *arg, - curl_formget_callback append); -/* - * NAME curl_formfree() - * - * DESCRIPTION - * - * Free a multipart formpost previously built with curl_formadd(). - */ -CURL_EXTERN void curl_formfree(struct curl_httppost *form); - -/* - * NAME curl_getenv() - * - * DESCRIPTION - * - * Returns a malloc()'ed string that MUST be curl_free()ed after usage is - * complete. DEPRECATED - see lib/README.curlx - */ -CURL_EXTERN char *curl_getenv(const char *variable); - -/* - * NAME curl_version() - * - * DESCRIPTION - * - * Returns a static ascii string of the libcurl version. - */ -CURL_EXTERN char *curl_version(void); - -/* - * NAME curl_easy_escape() - * - * DESCRIPTION - * - * Escapes URL strings (converts all letters consider illegal in URLs to their - * %XX versions). This function returns a new allocated string or NULL if an - * error occurred. - */ -CURL_EXTERN char *curl_easy_escape(CURL *handle, - const char *string, - int length); - -/* the previous version: */ -CURL_EXTERN char *curl_escape(const char *string, - int length); - - -/* - * NAME curl_easy_unescape() - * - * DESCRIPTION - * - * Unescapes URL encoding in strings (converts all %XX codes to their 8bit - * versions). This function returns a new allocated string or NULL if an error - * occurred. - * Conversion Note: On non-ASCII platforms the ASCII %XX codes are - * converted into the host encoding. - */ -CURL_EXTERN char *curl_easy_unescape(CURL *handle, - const char *string, - int length, - int *outlength); - -/* the previous version */ -CURL_EXTERN char *curl_unescape(const char *string, - int length); - -/* - * NAME curl_free() - * - * DESCRIPTION - * - * Provided for de-allocation in the same translation unit that did the - * allocation. Added in libcurl 7.10 - */ -CURL_EXTERN void curl_free(void *p); - -/* - * NAME curl_global_init() - * - * DESCRIPTION - * - * curl_global_init() should be invoked exactly once for each application that - * uses libcurl and before any call of other libcurl functions. - * - * This function is not thread-safe! - */ -CURL_EXTERN CURLcode curl_global_init(long flags); - -/* - * NAME curl_global_init_mem() - * - * DESCRIPTION - * - * curl_global_init() or curl_global_init_mem() should be invoked exactly once - * for each application that uses libcurl. This function can be used to - * initialize libcurl and set user defined memory management callback - * functions. Users can implement memory management routines to check for - * memory leaks, check for mis-use of the curl library etc. User registered - * callback routines with be invoked by this library instead of the system - * memory management routines like malloc, free etc. - */ -CURL_EXTERN CURLcode curl_global_init_mem(long flags, - curl_malloc_callback m, - curl_free_callback f, - curl_realloc_callback r, - curl_strdup_callback s, - curl_calloc_callback c); - -/* - * NAME curl_global_cleanup() - * - * DESCRIPTION - * - * curl_global_cleanup() should be invoked exactly once for each application - * that uses libcurl - */ -CURL_EXTERN void curl_global_cleanup(void); - -/* linked-list structure for the CURLOPT_QUOTE option (and other) */ -struct curl_slist { - char *data; - struct curl_slist *next; -}; - -/* - * NAME curl_slist_append() - * - * DESCRIPTION - * - * Appends a string to a linked list. If no list exists, it will be created - * first. Returns the new list, after appending. - */ -CURL_EXTERN struct curl_slist *curl_slist_append(struct curl_slist *, - const char *); - -/* - * NAME curl_slist_free_all() - * - * DESCRIPTION - * - * free a previously built curl_slist. - */ -CURL_EXTERN void curl_slist_free_all(struct curl_slist *); - -/* - * NAME curl_getdate() - * - * DESCRIPTION - * - * Returns the time, in seconds since 1 Jan 1970 of the time string given in - * the first argument. The time argument in the second parameter is unused - * and should be set to NULL. - */ -CURL_EXTERN time_t curl_getdate(const char *p, const time_t *unused); - -/* info about the certificate chain, only for OpenSSL builds. Asked - for with CURLOPT_CERTINFO / CURLINFO_CERTINFO */ -struct curl_certinfo { - int num_of_certs; /* number of certificates with information */ - struct curl_slist **certinfo; /* for each index in this array, there's a - linked list with textual information in the - format "name: value" */ -}; - -#define CURLINFO_STRING 0x100000 -#define CURLINFO_LONG 0x200000 -#define CURLINFO_DOUBLE 0x300000 -#define CURLINFO_SLIST 0x400000 -#define CURLINFO_MASK 0x0fffff -#define CURLINFO_TYPEMASK 0xf00000 - -typedef enum { - CURLINFO_NONE, /* first, never use this */ - CURLINFO_EFFECTIVE_URL = CURLINFO_STRING + 1, - CURLINFO_RESPONSE_CODE = CURLINFO_LONG + 2, - CURLINFO_TOTAL_TIME = CURLINFO_DOUBLE + 3, - CURLINFO_NAMELOOKUP_TIME = CURLINFO_DOUBLE + 4, - CURLINFO_CONNECT_TIME = CURLINFO_DOUBLE + 5, - CURLINFO_PRETRANSFER_TIME = CURLINFO_DOUBLE + 6, - CURLINFO_SIZE_UPLOAD = CURLINFO_DOUBLE + 7, - CURLINFO_SIZE_DOWNLOAD = CURLINFO_DOUBLE + 8, - CURLINFO_SPEED_DOWNLOAD = CURLINFO_DOUBLE + 9, - CURLINFO_SPEED_UPLOAD = CURLINFO_DOUBLE + 10, - CURLINFO_HEADER_SIZE = CURLINFO_LONG + 11, - CURLINFO_REQUEST_SIZE = CURLINFO_LONG + 12, - CURLINFO_SSL_VERIFYRESULT = CURLINFO_LONG + 13, - CURLINFO_FILETIME = CURLINFO_LONG + 14, - CURLINFO_CONTENT_LENGTH_DOWNLOAD = CURLINFO_DOUBLE + 15, - CURLINFO_CONTENT_LENGTH_UPLOAD = CURLINFO_DOUBLE + 16, - CURLINFO_STARTTRANSFER_TIME = CURLINFO_DOUBLE + 17, - CURLINFO_CONTENT_TYPE = CURLINFO_STRING + 18, - CURLINFO_REDIRECT_TIME = CURLINFO_DOUBLE + 19, - CURLINFO_REDIRECT_COUNT = CURLINFO_LONG + 20, - CURLINFO_PRIVATE = CURLINFO_STRING + 21, - CURLINFO_HTTP_CONNECTCODE = CURLINFO_LONG + 22, - CURLINFO_HTTPAUTH_AVAIL = CURLINFO_LONG + 23, - CURLINFO_PROXYAUTH_AVAIL = CURLINFO_LONG + 24, - CURLINFO_OS_ERRNO = CURLINFO_LONG + 25, - CURLINFO_NUM_CONNECTS = CURLINFO_LONG + 26, - CURLINFO_SSL_ENGINES = CURLINFO_SLIST + 27, - CURLINFO_COOKIELIST = CURLINFO_SLIST + 28, - CURLINFO_LASTSOCKET = CURLINFO_LONG + 29, - CURLINFO_FTP_ENTRY_PATH = CURLINFO_STRING + 30, - CURLINFO_REDIRECT_URL = CURLINFO_STRING + 31, - CURLINFO_PRIMARY_IP = CURLINFO_STRING + 32, - CURLINFO_APPCONNECT_TIME = CURLINFO_DOUBLE + 33, - CURLINFO_CERTINFO = CURLINFO_SLIST + 34, - CURLINFO_CONDITION_UNMET = CURLINFO_LONG + 35, - CURLINFO_RTSP_SESSION_ID = CURLINFO_STRING + 36, - CURLINFO_RTSP_CLIENT_CSEQ = CURLINFO_LONG + 37, - CURLINFO_RTSP_SERVER_CSEQ = CURLINFO_LONG + 38, - CURLINFO_RTSP_CSEQ_RECV = CURLINFO_LONG + 39, - /* Fill in new entries below here! */ - - CURLINFO_LASTONE = 39 -} CURLINFO; - -/* CURLINFO_RESPONSE_CODE is the new name for the option previously known as - CURLINFO_HTTP_CODE */ -#define CURLINFO_HTTP_CODE CURLINFO_RESPONSE_CODE - -typedef enum { - CURLCLOSEPOLICY_NONE, /* first, never use this */ - - CURLCLOSEPOLICY_OLDEST, - CURLCLOSEPOLICY_LEAST_RECENTLY_USED, - CURLCLOSEPOLICY_LEAST_TRAFFIC, - CURLCLOSEPOLICY_SLOWEST, - CURLCLOSEPOLICY_CALLBACK, - - CURLCLOSEPOLICY_LAST /* last, never use this */ -} curl_closepolicy; - -#define CURL_GLOBAL_SSL (1<<0) -#define CURL_GLOBAL_WIN32 (1<<1) -#define CURL_GLOBAL_ALL (CURL_GLOBAL_SSL|CURL_GLOBAL_WIN32) -#define CURL_GLOBAL_NOTHING 0 -#define CURL_GLOBAL_DEFAULT CURL_GLOBAL_ALL - - -/***************************************************************************** - * Setup defines, protos etc for the sharing stuff. - */ - -/* Different data locks for a single share */ -typedef enum { - CURL_LOCK_DATA_NONE = 0, - /* CURL_LOCK_DATA_SHARE is used internally to say that - * the locking is just made to change the internal state of the share - * itself. - */ - CURL_LOCK_DATA_SHARE, - CURL_LOCK_DATA_COOKIE, - CURL_LOCK_DATA_DNS, - CURL_LOCK_DATA_SSL_SESSION, - CURL_LOCK_DATA_CONNECT, - CURL_LOCK_DATA_LAST -} curl_lock_data; - -/* Different lock access types */ -typedef enum { - CURL_LOCK_ACCESS_NONE = 0, /* unspecified action */ - CURL_LOCK_ACCESS_SHARED = 1, /* for read perhaps */ - CURL_LOCK_ACCESS_SINGLE = 2, /* for write perhaps */ - CURL_LOCK_ACCESS_LAST /* never use */ -} curl_lock_access; - -typedef void (*curl_lock_function)(CURL *handle, - curl_lock_data data, - curl_lock_access locktype, - void *userptr); -typedef void (*curl_unlock_function)(CURL *handle, - curl_lock_data data, - void *userptr); - -typedef void CURLSH; - -typedef enum { - CURLSHE_OK, /* all is fine */ - CURLSHE_BAD_OPTION, /* 1 */ - CURLSHE_IN_USE, /* 2 */ - CURLSHE_INVALID, /* 3 */ - CURLSHE_NOMEM, /* out of memory */ - CURLSHE_LAST /* never use */ -} CURLSHcode; - -typedef enum { - CURLSHOPT_NONE, /* don't use */ - CURLSHOPT_SHARE, /* specify a data type to share */ - CURLSHOPT_UNSHARE, /* specify which data type to stop sharing */ - CURLSHOPT_LOCKFUNC, /* pass in a 'curl_lock_function' pointer */ - CURLSHOPT_UNLOCKFUNC, /* pass in a 'curl_unlock_function' pointer */ - CURLSHOPT_USERDATA, /* pass in a user data pointer used in the lock/unlock - callback functions */ - CURLSHOPT_LAST /* never use */ -} CURLSHoption; - -CURL_EXTERN CURLSH *curl_share_init(void); -CURL_EXTERN CURLSHcode curl_share_setopt(CURLSH *, CURLSHoption option, ...); -CURL_EXTERN CURLSHcode curl_share_cleanup(CURLSH *); - -/**************************************************************************** - * Structures for querying information about the curl library at runtime. - */ - -typedef enum { - CURLVERSION_FIRST, - CURLVERSION_SECOND, - CURLVERSION_THIRD, - CURLVERSION_FOURTH, - CURLVERSION_LAST /* never actually use this */ -} CURLversion; - -/* The 'CURLVERSION_NOW' is the symbolic name meant to be used by - basically all programs ever that want to get version information. It is - meant to be a built-in version number for what kind of struct the caller - expects. If the struct ever changes, we redefine the NOW to another enum - from above. */ -#define CURLVERSION_NOW CURLVERSION_FOURTH - -typedef struct { - CURLversion age; /* age of the returned struct */ - const char *version; /* LIBCURL_VERSION */ - unsigned int version_num; /* LIBCURL_VERSION_NUM */ - const char *host; /* OS/host/cpu/machine when configured */ - int features; /* bitmask, see defines below */ - const char *ssl_version; /* human readable string */ - long ssl_version_num; /* not used anymore, always 0 */ - const char *libz_version; /* human readable string */ - /* protocols is terminated by an entry with a NULL protoname */ - const char * const *protocols; - - /* The fields below this were added in CURLVERSION_SECOND */ - const char *ares; - int ares_num; - - /* This field was added in CURLVERSION_THIRD */ - const char *libidn; - - /* These field were added in CURLVERSION_FOURTH */ - - /* Same as '_libiconv_version' if built with HAVE_ICONV */ - int iconv_ver_num; - - const char *libssh_version; /* human readable string */ - -} curl_version_info_data; - -#define CURL_VERSION_IPV6 (1<<0) /* IPv6-enabled */ -#define CURL_VERSION_KERBEROS4 (1<<1) /* kerberos auth is supported */ -#define CURL_VERSION_SSL (1<<2) /* SSL options are present */ -#define CURL_VERSION_LIBZ (1<<3) /* libz features are present */ -#define CURL_VERSION_NTLM (1<<4) /* NTLM auth is supported */ -#define CURL_VERSION_GSSNEGOTIATE (1<<5) /* Negotiate auth support */ -#define CURL_VERSION_DEBUG (1<<6) /* built with debug capabilities */ -#define CURL_VERSION_ASYNCHDNS (1<<7) /* asynchronous dns resolves */ -#define CURL_VERSION_SPNEGO (1<<8) /* SPNEGO auth */ -#define CURL_VERSION_LARGEFILE (1<<9) /* supports files bigger than 2GB */ -#define CURL_VERSION_IDN (1<<10) /* International Domain Names support */ -#define CURL_VERSION_SSPI (1<<11) /* SSPI is supported */ -#define CURL_VERSION_CONV (1<<12) /* character conversions supported */ -#define CURL_VERSION_CURLDEBUG (1<<13) /* debug memory tracking supported */ - -/* - * NAME curl_version_info() - * - * DESCRIPTION - * - * This function returns a pointer to a static copy of the version info - * struct. See above. - */ -CURL_EXTERN curl_version_info_data *curl_version_info(CURLversion); - -/* - * NAME curl_easy_strerror() - * - * DESCRIPTION - * - * The curl_easy_strerror function may be used to turn a CURLcode value - * into the equivalent human readable error string. This is useful - * for printing meaningful error messages. - */ -CURL_EXTERN const char *curl_easy_strerror(CURLcode); - -/* - * NAME curl_share_strerror() - * - * DESCRIPTION - * - * The curl_share_strerror function may be used to turn a CURLSHcode value - * into the equivalent human readable error string. This is useful - * for printing meaningful error messages. - */ -CURL_EXTERN const char *curl_share_strerror(CURLSHcode); - -/* - * NAME curl_easy_pause() - * - * DESCRIPTION - * - * The curl_easy_pause function pauses or unpauses transfers. Select the new - * state by setting the bitmask, use the convenience defines below. - * - */ -CURL_EXTERN CURLcode curl_easy_pause(CURL *handle, int bitmask); - -#define CURLPAUSE_RECV (1<<0) -#define CURLPAUSE_RECV_CONT (0) - -#define CURLPAUSE_SEND (1<<2) -#define CURLPAUSE_SEND_CONT (0) - -#define CURLPAUSE_ALL (CURLPAUSE_RECV|CURLPAUSE_SEND) -#define CURLPAUSE_CONT (CURLPAUSE_RECV_CONT|CURLPAUSE_SEND_CONT) - -#ifdef __cplusplus -} -#endif - -/* unfortunately, the easy.h and multi.h include files need options and info - stuff before they can be included! */ -#include "easy.h" /* nothing in curl is fun without the easy stuff */ -#include "multi.h" - -/* the typechecker doesn't work in C++ (yet) */ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) && \ - ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) && \ - !defined(__cplusplus) && !defined(CURL_DISABLE_TYPECHECK) -#include "typecheck-gcc.h" -#else -#if defined(__STDC__) && (__STDC__ >= 1) -/* This preprocessor magic that replaces a call with the exact same call is - only done to make sure application authors pass exactly three arguments - to these functions. */ -#define curl_easy_setopt(handle,opt,param) curl_easy_setopt(handle,opt,param) -#define curl_easy_getinfo(handle,info,arg) curl_easy_getinfo(handle,info,arg) -#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) -#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) -#endif /* __STDC__ >= 1 */ -#endif /* gcc >= 4.3 && !__cplusplus */ - -#endif /* __CURL_CURL_H */ diff --git a/src/utils/curl/curlbuild.h b/src/utils/curl/curlbuild.h deleted file mode 100644 index d0b32acbec47fb148136d2368b0d0d783b5212e7..0000000000000000000000000000000000000000 --- a/src/utils/curl/curlbuild.h +++ /dev/null @@ -1,583 +0,0 @@ -#ifndef __CURL_CURLBUILD_H -#define __CURL_CURLBUILD_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* NOTES FOR CONFIGURE CAPABLE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * See file include/curl/curlbuild.h.in, run configure, and forget - * that this file exists it is only used for non-configure systems. - * But you can keep reading if you want ;-) - * - */ - -/* ================================================================ */ -/* NOTES FOR NON-CONFIGURE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. - * - * If you think that something actually needs to be changed, adjusted - * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * Try to keep one section per platform, compiler and architecture, - * otherwise, if an existing section is reused for a different one and - * later on the original is adjusted, probably the piggybacking one can - * be adversely changed. - * - * In order to differentiate between platforms/compilers/architectures - * use only compiler built in predefined preprocessor symbols. - * - * This header file shall only export symbols which are 'curl' or 'CURL' - * prefixed, otherwise public name space would be polluted. - * - * NOTE 2: - * ------- - * - * For any given platform/compiler curl_off_t must be typedef'ed to a - * 64-bit wide signed integral data type. The width of this data type - * must remain constant and independent of any possible large file - * support settings. - * - * As an exception to the above, curl_off_t shall be typedef'ed to a - * 32-bit wide signed integral data type if there is no 64-bit type. - * - * As a general rule, curl_off_t shall not be mapped to off_t. This - * rule shall only be violated if off_t is the only 64-bit data type - * available and the size of off_t is independent of large file support - * settings. Keep your build on the safe side avoiding an off_t gating. - * If you have a 64-bit off_t then take for sure that another 64-bit - * data type exists, dig deeper and you will find it. - * - * NOTE 3: - * ------- - * - * Right now you might be staring at file include/curl/curlbuild.h.dist or - * at file include/curl/curlbuild.h, this is due to the following reason: - * file include/curl/curlbuild.h.dist is renamed to include/curl/curlbuild.h - * when the libcurl source code distribution archive file is created. - * - * File include/curl/curlbuild.h.dist is not included in the distribution - * archive. File include/curl/curlbuild.h is not present in the git tree. - * - * The distributed include/curl/curlbuild.h file is only intended to be used - * on systems which can not run the also distributed configure script. - * - * On systems capable of running the configure script, the configure process - * will overwrite the distributed include/curl/curlbuild.h file with one that - * is suitable and specific to the library being configured and built, which - * is generated from the include/curl/curlbuild.h.in template file. - * - * If you check out from git on a non-configure platform, you must run the - * appropriate buildconf* script to set up curlbuild.h and other local files. - * - */ - -/* ================================================================ */ -/* DEFINITION OF THESE SYMBOLS SHALL NOT TAKE PLACE ANYWHERE ELSE */ -/* ================================================================ */ - -#ifdef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_LONG_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_already_defined -#endif - -#ifdef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_OFF_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_already_defined -#endif - -/* ================================================================ */ -/* EXTERNAL INTERFACE SETTINGS FOR NON-CONFIGURE SYSTEMS ONLY */ -/* ================================================================ */ - -#if defined(__DJGPP__) || defined(__GO32__) -# if defined(__DJGPP__) && (__DJGPP__ > 1) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__SALFORDC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__BORLANDC__) -# if (__BORLANDC__ < 0x520) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__TURBOC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__WATCOMC__) -# if defined(__386__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__POCC__) -# if (__POCC__ < 280) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# elif defined(_MSC_VER) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__LCC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__SYMBIAN32__) -# if defined(__EABI__) /* Treat all ARM compilers equally */ -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__CW32__) -# pragma longlong on -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__VC32__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T unsigned int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__MWERKS__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(_WIN32_WCE) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__MINGW32__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__VMS) -# if defined(__VAX) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T unsigned int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__OS400__) -# if defined(__ILEC400__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(__MVS__) -# if defined(__IBMC__) || defined(__IBMCPP__) -# if defined(_ILP32) -# define CURL_SIZEOF_LONG 4 -# elif defined(_LP64) -# define CURL_SIZEOF_LONG 8 -# endif -# if defined(_LONG_LONG) -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(_LP64) -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(__370__) -# if defined(__IBMC__) || defined(__IBMCPP__) -# if defined(_ILP32) -# define CURL_SIZEOF_LONG 4 -# elif defined(_LP64) -# define CURL_SIZEOF_LONG 8 -# endif -# if defined(_LONG_LONG) -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(_LP64) -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(TPF) -# define CURL_SIZEOF_LONG 8 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -/* ===================================== */ -/* KEEP MSVC THE PENULTIMATE ENTRY */ -/* ===================================== */ - -#elif defined(_MSC_VER) -# if (_MSC_VER >= 900) && (_INTEGRAL_MAX_BITS >= 64) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -/* ===================================== */ -/* KEEP GENERIC GCC THE LAST ENTRY */ -/* ===================================== */ - -#elif defined(__GNUC__) -# if defined(__i386__) || defined(__ppc__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__x86_64__) || defined(__ppc64__) -# define CURL_SIZEOF_LONG 8 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 - -#else -# error "Unknown non-configure build target!" - Error Compilation_aborted_Unknown_non_configure_build_target -#endif - -/* CURL_PULL_SYS_TYPES_H is defined above when inclusion of header file */ -/* sys/types.h is required here to properly make type definitions below. */ -#ifdef CURL_PULL_SYS_TYPES_H -# include <sys/types.h> -#endif - -/* CURL_PULL_SYS_SOCKET_H is defined above when inclusion of header file */ -/* sys/socket.h is required here to properly make type definitions below. */ -#ifdef CURL_PULL_SYS_SOCKET_H -# include <sys/socket.h> -#endif - -/* Data type definition of curl_socklen_t. */ - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T - typedef CURL_TYPEOF_CURL_SOCKLEN_T curl_socklen_t; -#endif - -/* Data type definition of curl_off_t. */ - -#ifdef CURL_TYPEOF_CURL_OFF_T - typedef CURL_TYPEOF_CURL_OFF_T curl_off_t; -#endif - -#endif /* __CURL_CURLBUILD_H */ diff --git a/src/utils/curl/curlbuild.h.cmake b/src/utils/curl/curlbuild.h.cmake deleted file mode 100644 index 3aa772fc4715dc853ee1c4752f55eaa900d1d2b3..0000000000000000000000000000000000000000 --- a/src/utils/curl/curlbuild.h.cmake +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef __CURL_CURLBUILD_H -#define __CURL_CURLBUILD_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2008, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* NOTES FOR CONFIGURE CAPABLE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. - * - * If you think that something actually needs to be changed, adjusted - * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * This header file shall only export symbols which are 'curl' or 'CURL' - * prefixed, otherwise public name space would be polluted. - * - * NOTE 2: - * ------- - * - * Right now you might be staring at file include/curl/curlbuild.h.in or - * at file include/curl/curlbuild.h, this is due to the following reason: - * - * On systems capable of running the configure script, the configure process - * will overwrite the distributed include/curl/curlbuild.h file with one that - * is suitable and specific to the library being configured and built, which - * is generated from the include/curl/curlbuild.h.in template file. - * - */ - -/* ================================================================ */ -/* DEFINITION OF THESE SYMBOLS SHALL NOT TAKE PLACE ANYWHERE ELSE */ -/* ================================================================ */ - -#ifdef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_LONG_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_already_defined -#endif -#ifdef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_already_defined -#endif - -#ifdef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_OFF_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_already_defined -#endif - -/* ================================================================ */ -/* EXTERNAL INTERFACE SETTINGS FOR CONFIGURE CAPABLE SYSTEMS ONLY */ -/* ================================================================ */ - -/* Configure process defines this to 1 when it finds out that system */ -/* header file sys/types.h must be included by the external interface. */ -#cmakedefine CURL_PULL_SYS_TYPES_H ${CURL_PULL_SYS_TYPES_H} -#ifdef CURL_PULL_SYS_TYPES_H -# include <sys/types.h> -#endif - -/* Configure process defines this to 1 when it finds out that system */ -/* header file stdint.h must be included by the external interface. */ -#cmakedefine CURL_PULL_STDINT_H ${CURL_PULL_STDINT_H} -#ifdef CURL_PULL_STDINT_H -# include <stdint.h> -#endif - -/* Configure process defines this to 1 when it finds out that system */ -/* header file inttypes.h must be included by the external interface. */ -#cmakedefine CURL_PULL_INTTYPES_H ${CURL_PULL_INTTYPES_H} -#ifdef CURL_PULL_INTTYPES_H -# include <inttypes.h> -#endif - -/* The size of `long', as computed by sizeof. */ -#cmakedefine CURL_SIZEOF_LONG ${CURL_SIZEOF_LONG} - -/* Integral data type used for curl_socklen_t. */ -#cmakedefine CURL_TYPEOF_CURL_SOCKLEN_T ${CURL_TYPEOF_CURL_SOCKLEN_T} - -/* on windows socklen_t is in here */ -#ifdef _WIN32 -# include <winsock2.h> -# include <ws2tcpip.h> -#endif - -#ifdef HAVE_SYS_SOCKET_H -# include <sys/socket.h> -#endif - -/* Data type definition of curl_socklen_t. */ -typedef CURL_TYPEOF_CURL_SOCKLEN_T curl_socklen_t; - -/* The size of `curl_socklen_t', as computed by sizeof. */ -#cmakedefine CURL_SIZEOF_CURL_SOCKLEN_T ${CURL_SIZEOF_CURL_SOCKLEN_T} - -/* Signed integral data type used for curl_off_t. */ -#cmakedefine CURL_TYPEOF_CURL_OFF_T ${CURL_TYPEOF_CURL_OFF_T} - -/* Data type definition of curl_off_t. */ -typedef CURL_TYPEOF_CURL_OFF_T curl_off_t; - -/* curl_off_t formatting string directive without "%" conversion specifier. */ -#cmakedefine CURL_FORMAT_CURL_OFF_T "${CURL_FORMAT_CURL_OFF_T}" - -/* unsigned curl_off_t formatting string without "%" conversion specifier. */ -#cmakedefine CURL_FORMAT_CURL_OFF_TU "${CURL_FORMAT_CURL_OFF_TU}" - -/* curl_off_t formatting string directive with "%" conversion specifier. */ -#cmakedefine CURL_FORMAT_OFF_T "${CURL_FORMAT_OFF_T}" - -/* The size of `curl_off_t', as computed by sizeof. */ -#cmakedefine CURL_SIZEOF_CURL_OFF_T ${CURL_SIZEOF_CURL_OFF_T} - -/* curl_off_t constant suffix. */ -#cmakedefine CURL_SUFFIX_CURL_OFF_T ${CURL_SUFFIX_CURL_OFF_T} - -/* unsigned curl_off_t constant suffix. */ -#cmakedefine CURL_SUFFIX_CURL_OFF_TU ${CURL_SUFFIX_CURL_OFF_TU} - -#endif /* __CURL_CURLBUILD_H */ diff --git a/src/utils/curl/curlbuild.h.in b/src/utils/curl/curlbuild.h.in deleted file mode 100644 index cb1de80a03e6ec7f4c1c3ed410ccfb3d76e1f17e..0000000000000000000000000000000000000000 --- a/src/utils/curl/curlbuild.h.in +++ /dev/null @@ -1,190 +0,0 @@ -#ifndef __CURL_CURLBUILD_H -#define __CURL_CURLBUILD_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2009, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* NOTES FOR CONFIGURE CAPABLE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. - * - * If you think that something actually needs to be changed, adjusted - * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * This header file shall only export symbols which are 'curl' or 'CURL' - * prefixed, otherwise public name space would be polluted. - * - * NOTE 2: - * ------- - * - * Right now you might be staring at file include/curl/curlbuild.h.in or - * at file include/curl/curlbuild.h, this is due to the following reason: - * - * On systems capable of running the configure script, the configure process - * will overwrite the distributed include/curl/curlbuild.h file with one that - * is suitable and specific to the library being configured and built, which - * is generated from the include/curl/curlbuild.h.in template file. - * - */ - -/* ================================================================ */ -/* DEFINITION OF THESE SYMBOLS SHALL NOT TAKE PLACE ANYWHERE ELSE */ -/* ================================================================ */ - -#ifdef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_LONG_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_already_defined -#endif - -#ifdef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_OFF_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_already_defined -#endif - -/* ================================================================ */ -/* EXTERNAL INTERFACE SETTINGS FOR CONFIGURE CAPABLE SYSTEMS ONLY */ -/* ================================================================ */ - -/* Configure process defines this to 1 when it finds out that system */ -/* header file ws2tcpip.h must be included by the external interface. */ -#undef CURL_PULL_WS2TCPIP_H -#ifdef CURL_PULL_WS2TCPIP_H -# ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -# endif -# include <windows.h> -# include <winsock2.h> -# include <ws2tcpip.h> -#endif - -/* Configure process defines this to 1 when it finds out that system */ -/* header file sys/types.h must be included by the external interface. */ -#undef CURL_PULL_SYS_TYPES_H -#ifdef CURL_PULL_SYS_TYPES_H -# include <sys/types.h> -#endif - -/* Configure process defines this to 1 when it finds out that system */ -/* header file stdint.h must be included by the external interface. */ -#undef CURL_PULL_STDINT_H -#ifdef CURL_PULL_STDINT_H -# include <stdint.h> -#endif - -/* Configure process defines this to 1 when it finds out that system */ -/* header file inttypes.h must be included by the external interface. */ -#undef CURL_PULL_INTTYPES_H -#ifdef CURL_PULL_INTTYPES_H -# include <inttypes.h> -#endif - -/* Configure process defines this to 1 when it finds out that system */ -/* header file sys/socket.h must be included by the external interface. */ -#undef CURL_PULL_SYS_SOCKET_H -#ifdef CURL_PULL_SYS_SOCKET_H -# include <sys/socket.h> -#endif - -/* The size of `long', as computed by sizeof. */ -#undef CURL_SIZEOF_LONG - -/* Integral data type used for curl_socklen_t. */ -#undef CURL_TYPEOF_CURL_SOCKLEN_T - -/* The size of `curl_socklen_t', as computed by sizeof. */ -#undef CURL_SIZEOF_CURL_SOCKLEN_T - -/* Data type definition of curl_socklen_t. */ -typedef CURL_TYPEOF_CURL_SOCKLEN_T curl_socklen_t; - -/* Signed integral data type used for curl_off_t. */ -#undef CURL_TYPEOF_CURL_OFF_T - -/* Data type definition of curl_off_t. */ -typedef CURL_TYPEOF_CURL_OFF_T curl_off_t; - -/* curl_off_t formatting string directive without "%" conversion specifier. */ -#undef CURL_FORMAT_CURL_OFF_T - -/* unsigned curl_off_t formatting string without "%" conversion specifier. */ -#undef CURL_FORMAT_CURL_OFF_TU - -/* curl_off_t formatting string directive with "%" conversion specifier. */ -#undef CURL_FORMAT_OFF_T - -/* The size of `curl_off_t', as computed by sizeof. */ -#undef CURL_SIZEOF_CURL_OFF_T - -/* curl_off_t constant suffix. */ -#undef CURL_SUFFIX_CURL_OFF_T - -/* unsigned curl_off_t constant suffix. */ -#undef CURL_SUFFIX_CURL_OFF_TU - -#endif /* __CURL_CURLBUILD_H */ diff --git a/src/utils/curl/curlrules.h b/src/utils/curl/curlrules.h deleted file mode 100644 index 8aad1df678c0d827ca792c16137d17d86963fc3d..0000000000000000000000000000000000000000 --- a/src/utils/curl/curlrules.h +++ /dev/null @@ -1,252 +0,0 @@ -#ifndef __CURL_CURLRULES_H -#define __CURL_CURLRULES_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* COMPILE TIME SANITY CHECKS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * All checks done in this file are intentionally placed in a public - * header file which is pulled by curl/curl.h when an application is - * being built using an already built libcurl library. Additionally - * this file is also included and used when building the library. - * - * If compilation fails on this file it is certainly sure that the - * problem is elsewhere. It could be a problem in the curlbuild.h - * header file, or simply that you are using different compilation - * settings than those used to build the library. - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. - * - * Do not deactivate any check, these are done to make sure that the - * library is properly built and used. - * - * You can find further help on the libcurl development mailing list: - * http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * NOTE 2 - * ------ - * - * Some of the following compile time checks are based on the fact - * that the dimension of a constant array can not be a negative one. - * In this way if the compile time verification fails, the compilation - * will fail issuing an error. The error description wording is compiler - * dependent but it will be quite similar to one of the following: - * - * "negative subscript or subscript is too large" - * "array must have at least one element" - * "-1 is an illegal array size" - * "size of array is negative" - * - * If you are building an application which tries to use an already - * built libcurl library and you are getting this kind of errors on - * this file, it is a clear indication that there is a mismatch between - * how the library was built and how you are trying to use it for your - * application. Your already compiled or binary library provider is the - * only one who can give you the details you need to properly use it. - */ - -/* - * Verify that some macros are actually defined. - */ - -#ifndef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_LONG_is_missing -#endif - -#ifndef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T definition is missing!" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_is_missing -#endif - -#ifndef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_is_missing -#endif - -#ifndef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU definition is missing!" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_is_missing -#endif - -#ifndef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_OFF_T_is_missing -#endif - -#ifndef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU definition is missing!" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_is_missing -#endif - -/* - * Macros private to this header file. - */ - -#define CurlchkszEQ(t, s) sizeof(t) == s ? 1 : -1 - -#define CurlchkszGE(t1, t2) sizeof(t1) >= sizeof(t2) ? 1 : -1 - -/* - * Verify that the size previously defined and expected for long - * is the same as the one reported by sizeof() at compile time. - */ - -typedef char - __curl_rule_01__ - [CurlchkszEQ(long, CURL_SIZEOF_LONG)]; - -/* - * Verify that the size previously defined and expected for - * curl_off_t is actually the the same as the one reported - * by sizeof() at compile time. - */ - -typedef char - __curl_rule_02__ - [CurlchkszEQ(curl_off_t, CURL_SIZEOF_CURL_OFF_T)]; - -/* - * Verify at compile time that the size of curl_off_t as reported - * by sizeof() is greater or equal than the one reported for long - * for the current compilation. - */ - -typedef char - __curl_rule_03__ - [CurlchkszGE(curl_off_t, long)]; - -/* - * Verify that the size previously defined and expected for - * curl_socklen_t is actually the the same as the one reported - * by sizeof() at compile time. - */ - -typedef char - __curl_rule_04__ - [CurlchkszEQ(curl_socklen_t, CURL_SIZEOF_CURL_SOCKLEN_T)]; - -/* - * Verify at compile time that the size of curl_socklen_t as reported - * by sizeof() is greater or equal than the one reported for int for - * the current compilation. - */ - -typedef char - __curl_rule_05__ - [CurlchkszGE(curl_socklen_t, int)]; - -/* ================================================================ */ -/* EXTERNALLY AND INTERNALLY VISIBLE DEFINITIONS */ -/* ================================================================ */ - -/* - * CURL_ISOCPP and CURL_OFF_T_C definitions are done here in order to allow - * these to be visible and exported by the external libcurl interface API, - * while also making them visible to the library internals, simply including - * setup.h, without actually needing to include curl.h internally. - * If some day this section would grow big enough, all this should be moved - * to its own header file. - */ - -/* - * Figure out if we can use the ## preprocessor operator, which is supported - * by ISO/ANSI C and C++. Some compilers support it without setting __STDC__ - * or __cplusplus so we need to carefully check for them too. - */ - -#if defined(__STDC__) || defined(_MSC_VER) || defined(__cplusplus) || \ - defined(__HP_aCC) || defined(__BORLANDC__) || defined(__LCC__) || \ - defined(__POCC__) || defined(__SALFORDC__) || defined(__HIGHC__) || \ - defined(__ILEC400__) - /* This compiler is believed to have an ISO compatible preprocessor */ -#define CURL_ISOCPP -#else - /* This compiler is believed NOT to have an ISO compatible preprocessor */ -#undef CURL_ISOCPP -#endif - -/* - * Macros for minimum-width signed and unsigned curl_off_t integer constants. - */ - -#ifdef CURL_ISOCPP -# define __CURL_OFF_T_C_HELPER2(Val,Suffix) Val ## Suffix -#else -# define __CURL_OFF_T_C_HELPER2(Val,Suffix) Val/**/Suffix -#endif -#define __CURL_OFF_T_C_HELPER1(Val,Suffix) __CURL_OFF_T_C_HELPER2(Val,Suffix) -#define CURL_OFF_T_C(Val) __CURL_OFF_T_C_HELPER1(Val,CURL_SUFFIX_CURL_OFF_T) -#define CURL_OFF_TU_C(Val) __CURL_OFF_T_C_HELPER1(Val,CURL_SUFFIX_CURL_OFF_TU) - -/* - * Get rid of macros private to this header file. - */ - -#undef CurlchkszEQ -#undef CurlchkszGE - -/* - * Get rid of macros not intended to exist beyond this point. - */ - -#undef CURL_PULL_WS2TCPIP_H -#undef CURL_PULL_SYS_TYPES_H -#undef CURL_PULL_SYS_SOCKET_H -#undef CURL_PULL_STDINT_H -#undef CURL_PULL_INTTYPES_H - -#undef CURL_TYPEOF_CURL_SOCKLEN_T -#undef CURL_TYPEOF_CURL_OFF_T - -#ifdef CURL_NO_OLDIES -#undef CURL_FORMAT_OFF_T /* not required since 7.19.0 - obsoleted in 7.20.0 */ -#endif - -#endif /* __CURL_CURLRULES_H */ diff --git a/src/utils/curl/curlver.h b/src/utils/curl/curlver.h deleted file mode 100644 index 35ca9a8d57e071d8200705e548a17e708d09a0cc..0000000000000000000000000000000000000000 --- a/src/utils/curl/curlver.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __CURL_CURLVER_H -#define __CURL_CURLVER_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* This header file contains nothing but libcurl version info, generated by - a script at release-time. This was made its own header file in 7.11.2 */ - -/* This is the global package copyright */ -#define LIBCURL_COPYRIGHT "1996 - 2010 Daniel Stenberg, <daniel@haxx.se>." - -/* This is the version number of the libcurl package from which this header - file origins: */ -#define LIBCURL_VERSION "7.20.1" - -/* The numeric version number is also available "in parts" by using these - defines: */ -#define LIBCURL_VERSION_MAJOR 7 -#define LIBCURL_VERSION_MINOR 20 -#define LIBCURL_VERSION_PATCH 1 - -/* This is the numeric version of the libcurl version number, meant for easier - parsing and comparions by programs. The LIBCURL_VERSION_NUM define will - always follow this syntax: - - 0xXXYYZZ - - Where XX, YY and ZZ are the main version, release and patch numbers in - hexadecimal (using 8 bits each). All three numbers are always represented - using two digits. 1.2 would appear as "0x010200" while version 9.11.7 - appears as "0x090b07". - - This 6-digit (24 bits) hexadecimal number does not show pre-release number, - and it is always a greater number in a more recent release. It makes - comparisons with greater than and less than work. -*/ -#define LIBCURL_VERSION_NUM 0x071401 - -/* - * This is the date and time when the full source package was created. The - * timestamp is not stored in git, as the timestamp is properly set in the - * tarballs by the maketgz script. - * - * The format of the date should follow this template: - * - * "Mon Feb 12 11:35:33 UTC 2007" - */ -#define LIBCURL_TIMESTAMP "Wed Apr 14 14:55:20 UTC 2010" - -#endif /* __CURL_CURLVER_H */ diff --git a/src/utils/curl/easy.h b/src/utils/curl/easy.h deleted file mode 100644 index 1ddb4fe5a2cb476f1a1278b83c7b576135bd5aa9..0000000000000000000000000000000000000000 --- a/src/utils/curl/easy.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef __CURL_EASY_H -#define __CURL_EASY_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2008, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ -#ifdef __cplusplus -extern "C" { -#endif - -CURL_EXTERN CURL *curl_easy_init(void); -CURL_EXTERN CURLcode curl_easy_setopt(CURL *curl, CURLoption option, ...); -CURL_EXTERN CURLcode curl_easy_perform(CURL *curl); -CURL_EXTERN void curl_easy_cleanup(CURL *curl); - -/* - * NAME curl_easy_getinfo() - * - * DESCRIPTION - * - * Request internal information from the curl session with this function. The - * third argument MUST be a pointer to a long, a pointer to a char * or a - * pointer to a double (as the documentation describes elsewhere). The data - * pointed to will be filled in accordingly and can be relied upon only if the - * function returns CURLE_OK. This function is intended to get used *AFTER* a - * performed transfer, all results from this function are undefined until the - * transfer is completed. - */ -CURL_EXTERN CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...); - - -/* - * NAME curl_easy_duphandle() - * - * DESCRIPTION - * - * Creates a new curl session handle with the same options set for the handle - * passed in. Duplicating a handle could only be a matter of cloning data and - * options, internal state info and things like persistant connections cannot - * be transfered. It is useful in multithreaded applications when you can run - * curl_easy_duphandle() for each new thread to avoid a series of identical - * curl_easy_setopt() invokes in every thread. - */ -CURL_EXTERN CURL* curl_easy_duphandle(CURL *curl); - -/* - * NAME curl_easy_reset() - * - * DESCRIPTION - * - * Re-initializes a CURL handle to the default values. This puts back the - * handle to the same state as it was in when it was just created. - * - * It does keep: live connections, the Session ID cache, the DNS cache and the - * cookies. - */ -CURL_EXTERN void curl_easy_reset(CURL *curl); - -/* - * NAME curl_easy_recv() - * - * DESCRIPTION - * - * Receives data from the connected socket. Use after successful - * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. - */ -CURL_EXTERN CURLcode curl_easy_recv(CURL *curl, void *buffer, size_t buflen, - size_t *n); - -/* - * NAME curl_easy_send() - * - * DESCRIPTION - * - * Sends data over the connected socket. Use after successful - * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. - */ -CURL_EXTERN CURLcode curl_easy_send(CURL *curl, const void *buffer, - size_t buflen, size_t *n); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/utils/curl/mprintf.h b/src/utils/curl/mprintf.h deleted file mode 100644 index de7dd2f3c360e327e6280372e8a10f335fcf3504..0000000000000000000000000000000000000000 --- a/src/utils/curl/mprintf.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef __CURL_MPRINTF_H -#define __CURL_MPRINTF_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2006, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -#include <stdarg.h> -#include <stdio.h> /* needed for FILE */ - -#include "curl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -CURL_EXTERN int curl_mprintf(const char *format, ...); -CURL_EXTERN int curl_mfprintf(FILE *fd, const char *format, ...); -CURL_EXTERN int curl_msprintf(char *buffer, const char *format, ...); -CURL_EXTERN int curl_msnprintf(char *buffer, size_t maxlength, - const char *format, ...); -CURL_EXTERN int curl_mvprintf(const char *format, va_list args); -CURL_EXTERN int curl_mvfprintf(FILE *fd, const char *format, va_list args); -CURL_EXTERN int curl_mvsprintf(char *buffer, const char *format, va_list args); -CURL_EXTERN int curl_mvsnprintf(char *buffer, size_t maxlength, - const char *format, va_list args); -CURL_EXTERN char *curl_maprintf(const char *format, ...); -CURL_EXTERN char *curl_mvaprintf(const char *format, va_list args); - -#ifdef _MPRINTF_REPLACE -# undef printf -# undef fprintf -# undef sprintf -# undef vsprintf -# undef snprintf -# undef vprintf -# undef vfprintf -# undef vsnprintf -# undef aprintf -# undef vaprintf -# define printf curl_mprintf -# define fprintf curl_mfprintf -#ifdef CURLDEBUG -/* When built with CURLDEBUG we define away the sprintf() functions since we - don't want internal code to be using them */ -# define sprintf sprintf_was_used -# define vsprintf vsprintf_was_used -#else -# define sprintf curl_msprintf -# define vsprintf curl_mvsprintf -#endif -# define snprintf curl_msnprintf -# define vprintf curl_mvprintf -# define vfprintf curl_mvfprintf -# define vsnprintf curl_mvsnprintf -# define aprintf curl_maprintf -# define vaprintf curl_mvaprintf -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* __CURL_MPRINTF_H */ diff --git a/src/utils/curl/multi.h b/src/utils/curl/multi.h deleted file mode 100644 index f96566669c6771fbbf2e1735f826d8cb3a1fce11..0000000000000000000000000000000000000000 --- a/src/utils/curl/multi.h +++ /dev/null @@ -1,345 +0,0 @@ -#ifndef __CURL_MULTI_H -#define __CURL_MULTI_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2007, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ -/* - This is an "external" header file. Don't give away any internals here! - - GOALS - - o Enable a "pull" interface. The application that uses libcurl decides where - and when to ask libcurl to get/send data. - - o Enable multiple simultaneous transfers in the same thread without making it - complicated for the application. - - o Enable the application to select() on its own file descriptors and curl's - file descriptors simultaneous easily. - -*/ - -/* - * This header file should not really need to include "curl.h" since curl.h - * itself includes this file and we expect user applications to do #include - * <curl/curl.h> without the need for especially including multi.h. - * - * For some reason we added this include here at one point, and rather than to - * break existing (wrongly written) libcurl applications, we leave it as-is - * but with this warning attached. - */ -#include "curl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void CURLM; - -typedef enum { - CURLM_CALL_MULTI_PERFORM = -1, /* please call curl_multi_perform() or - curl_multi_socket*() soon */ - CURLM_OK, - CURLM_BAD_HANDLE, /* the passed-in handle is not a valid CURLM handle */ - CURLM_BAD_EASY_HANDLE, /* an easy handle was not good/valid */ - CURLM_OUT_OF_MEMORY, /* if you ever get this, you're in deep sh*t */ - CURLM_INTERNAL_ERROR, /* this is a libcurl bug */ - CURLM_BAD_SOCKET, /* the passed in socket argument did not match */ - CURLM_UNKNOWN_OPTION, /* curl_multi_setopt() with unsupported option */ - CURLM_LAST -} CURLMcode; - -/* just to make code nicer when using curl_multi_socket() you can now check - for CURLM_CALL_MULTI_SOCKET too in the same style it works for - curl_multi_perform() and CURLM_CALL_MULTI_PERFORM */ -#define CURLM_CALL_MULTI_SOCKET CURLM_CALL_MULTI_PERFORM - -typedef enum { - CURLMSG_NONE, /* first, not used */ - CURLMSG_DONE, /* This easy handle has completed. 'result' contains - the CURLcode of the transfer */ - CURLMSG_LAST /* last, not used */ -} CURLMSG; - -struct CURLMsg { - CURLMSG msg; /* what this message means */ - CURL *easy_handle; /* the handle it concerns */ - union { - void *whatever; /* message-specific data */ - CURLcode result; /* return code for transfer */ - } data; -}; -typedef struct CURLMsg CURLMsg; - -/* - * Name: curl_multi_init() - * - * Desc: inititalize multi-style curl usage - * - * Returns: a new CURLM handle to use in all 'curl_multi' functions. - */ -CURL_EXTERN CURLM *curl_multi_init(void); - -/* - * Name: curl_multi_add_handle() - * - * Desc: add a standard curl handle to the multi stack - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_add_handle(CURLM *multi_handle, - CURL *curl_handle); - - /* - * Name: curl_multi_remove_handle() - * - * Desc: removes a curl handle from the multi stack again - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_remove_handle(CURLM *multi_handle, - CURL *curl_handle); - - /* - * Name: curl_multi_fdset() - * - * Desc: Ask curl for its fd_set sets. The app can use these to select() or - * poll() on. We want curl_multi_perform() called as soon as one of - * them are ready. - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_fdset(CURLM *multi_handle, - fd_set *read_fd_set, - fd_set *write_fd_set, - fd_set *exc_fd_set, - int *max_fd); - - /* - * Name: curl_multi_perform() - * - * Desc: When the app thinks there's data available for curl it calls this - * function to read/write whatever there is right now. This returns - * as soon as the reads and writes are done. This function does not - * require that there actually is data available for reading or that - * data can be written, it can be called just in case. It returns - * the number of handles that still transfer data in the second - * argument's integer-pointer. - * - * Returns: CURLMcode type, general multi error code. *NOTE* that this only - * returns errors etc regarding the whole multi stack. There might - * still have occurred problems on invidual transfers even when this - * returns OK. - */ -CURL_EXTERN CURLMcode curl_multi_perform(CURLM *multi_handle, - int *running_handles); - - /* - * Name: curl_multi_cleanup() - * - * Desc: Cleans up and removes a whole multi stack. It does not free or - * touch any individual easy handles in any way. We need to define - * in what state those handles will be if this function is called - * in the middle of a transfer. - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_cleanup(CURLM *multi_handle); - -/* - * Name: curl_multi_info_read() - * - * Desc: Ask the multi handle if there's any messages/informationals from - * the individual transfers. Messages include informationals such as - * error code from the transfer or just the fact that a transfer is - * completed. More details on these should be written down as well. - * - * Repeated calls to this function will return a new struct each - * time, until a special "end of msgs" struct is returned as a signal - * that there is no more to get at this point. - * - * The data the returned pointer points to will not survive calling - * curl_multi_cleanup(). - * - * The 'CURLMsg' struct is meant to be very simple and only contain - * very basic informations. If more involved information is wanted, - * we will provide the particular "transfer handle" in that struct - * and that should/could/would be used in subsequent - * curl_easy_getinfo() calls (or similar). The point being that we - * must never expose complex structs to applications, as then we'll - * undoubtably get backwards compatibility problems in the future. - * - * Returns: A pointer to a filled-in struct, or NULL if it failed or ran out - * of structs. It also writes the number of messages left in the - * queue (after this read) in the integer the second argument points - * to. - */ -CURL_EXTERN CURLMsg *curl_multi_info_read(CURLM *multi_handle, - int *msgs_in_queue); - -/* - * Name: curl_multi_strerror() - * - * Desc: The curl_multi_strerror function may be used to turn a CURLMcode - * value into the equivalent human readable error string. This is - * useful for printing meaningful error messages. - * - * Returns: A pointer to a zero-terminated error message. - */ -CURL_EXTERN const char *curl_multi_strerror(CURLMcode); - -/* - * Name: curl_multi_socket() and - * curl_multi_socket_all() - * - * Desc: An alternative version of curl_multi_perform() that allows the - * application to pass in one of the file descriptors that have been - * detected to have "action" on them and let libcurl perform. - * See man page for details. - */ -#define CURL_POLL_NONE 0 -#define CURL_POLL_IN 1 -#define CURL_POLL_OUT 2 -#define CURL_POLL_INOUT 3 -#define CURL_POLL_REMOVE 4 - -#define CURL_SOCKET_TIMEOUT CURL_SOCKET_BAD - -#define CURL_CSELECT_IN 0x01 -#define CURL_CSELECT_OUT 0x02 -#define CURL_CSELECT_ERR 0x04 - -typedef int (*curl_socket_callback)(CURL *easy, /* easy handle */ - curl_socket_t s, /* socket */ - int what, /* see above */ - void *userp, /* private callback - pointer */ - void *socketp); /* private socket - pointer */ -/* - * Name: curl_multi_timer_callback - * - * Desc: Called by libcurl whenever the library detects a change in the - * maximum number of milliseconds the app is allowed to wait before - * curl_multi_socket() or curl_multi_perform() must be called - * (to allow libcurl's timed events to take place). - * - * Returns: The callback should return zero. - */ -typedef int (*curl_multi_timer_callback)(CURLM *multi, /* multi handle */ - long timeout_ms, /* see above */ - void *userp); /* private callback - pointer */ - -CURL_EXTERN CURLMcode curl_multi_socket(CURLM *multi_handle, curl_socket_t s, - int *running_handles); - -CURL_EXTERN CURLMcode curl_multi_socket_action(CURLM *multi_handle, - curl_socket_t s, - int ev_bitmask, - int *running_handles); - -CURL_EXTERN CURLMcode curl_multi_socket_all(CURLM *multi_handle, - int *running_handles); - -#ifndef CURL_ALLOW_OLD_MULTI_SOCKET -/* This macro below was added in 7.16.3 to push users who recompile to use - the new curl_multi_socket_action() instead of the old curl_multi_socket() -*/ -#define curl_multi_socket(x,y,z) curl_multi_socket_action(x,y,0,z) -#endif - -/* - * Name: curl_multi_timeout() - * - * Desc: Returns the maximum number of milliseconds the app is allowed to - * wait before curl_multi_socket() or curl_multi_perform() must be - * called (to allow libcurl's timed events to take place). - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_timeout(CURLM *multi_handle, - long *milliseconds); - -#undef CINIT /* re-using the same name as in curl.h */ - -#ifdef CURL_ISOCPP -#define CINIT(name,type,num) CURLMOPT_ ## name = CURLOPTTYPE_ ## type + num -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ -#define LONG CURLOPTTYPE_LONG -#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT -#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT -#define OFF_T CURLOPTTYPE_OFF_T -#define CINIT(name,type,number) CURLMOPT_/**/name = type + number -#endif - -typedef enum { - /* This is the socket callback function pointer */ - CINIT(SOCKETFUNCTION, FUNCTIONPOINT, 1), - - /* This is the argument passed to the socket callback */ - CINIT(SOCKETDATA, OBJECTPOINT, 2), - - /* set to 1 to enable pipelining for this multi handle */ - CINIT(PIPELINING, LONG, 3), - - /* This is the timer callback function pointer */ - CINIT(TIMERFUNCTION, FUNCTIONPOINT, 4), - - /* This is the argument passed to the timer callback */ - CINIT(TIMERDATA, OBJECTPOINT, 5), - - /* maximum number of entries in the connection cache */ - CINIT(MAXCONNECTS, LONG, 6), - - CURLMOPT_LASTENTRY /* the last unused */ -} CURLMoption; - - -/* - * Name: curl_multi_setopt() - * - * Desc: Sets options for the multi handle. - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_setopt(CURLM *multi_handle, - CURLMoption option, ...); - - -/* - * Name: curl_multi_assign() - * - * Desc: This function sets an association in the multi handle between the - * given socket and a private pointer of the application. This is - * (only) useful for curl_multi_socket uses. - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_assign(CURLM *multi_handle, - curl_socket_t sockfd, void *sockp); - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif - -#endif diff --git a/src/utils/curl/stdcheaders.h b/src/utils/curl/stdcheaders.h deleted file mode 100644 index ad82ef6335d6167aecf8b9e49d68c462ea028bfd..0000000000000000000000000000000000000000 --- a/src/utils/curl/stdcheaders.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __STDC_HEADERS_H -#define __STDC_HEADERS_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -#include <sys/types.h> - -size_t fread (void *, size_t, size_t, FILE *); -size_t fwrite (const void *, size_t, size_t, FILE *); - -int strcasecmp(const char *, const char *); -int strncasecmp(const char *, const char *, size_t); - -#endif /* __STDC_HEADERS_H */ diff --git a/src/utils/curl/typecheck-gcc.h b/src/utils/curl/typecheck-gcc.h deleted file mode 100644 index c4fad516dd4e232983852213c51983e247b8e525..0000000000000000000000000000000000000000 --- a/src/utils/curl/typecheck-gcc.h +++ /dev/null @@ -1,550 +0,0 @@ -#ifndef __CURL_TYPECHECK_GCC_H -#define __CURL_TYPECHECK_GCC_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2009, Daniel Stenberg, <daniel@haxx.se>, et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* wraps curl_easy_setopt() with typechecking */ - -/* To add a new kind of warning, add an - * if(_curl_is_sometype_option(_curl_opt) && ! _curl_is_sometype(value)) - * _curl_easy_setopt_err_sometype(); - * block and define _curl_is_sometype_option, _curl_is_sometype and - * _curl_easy_setopt_err_sometype below - * - * To add an option that uses the same type as an existing option, you'll just - * need to extend the appropriate _curl_*_option macro - */ -#define curl_easy_setopt(handle, option, value) \ -__extension__ ({ \ - __typeof__ (option) _curl_opt = option; \ - if (__builtin_constant_p(_curl_opt)) { \ - if (_curl_is_long_option(_curl_opt) && !_curl_is_long(value)) \ - _curl_easy_setopt_err_long(); \ - if (_curl_is_off_t_option(_curl_opt) && !_curl_is_off_t(value)) \ - _curl_easy_setopt_err_curl_off_t(); \ - if (_curl_is_string_option(_curl_opt) && !_curl_is_string(value)) \ - _curl_easy_setopt_err_string(); \ - if (_curl_is_write_cb_option(_curl_opt) && !_curl_is_write_cb(value)) \ - _curl_easy_setopt_err_write_callback(); \ - if ((_curl_opt) == CURLOPT_READFUNCTION && !_curl_is_read_cb(value)) \ - _curl_easy_setopt_err_read_cb(); \ - if ((_curl_opt) == CURLOPT_IOCTLFUNCTION && !_curl_is_ioctl_cb(value)) \ - _curl_easy_setopt_err_ioctl_cb(); \ - if ((_curl_opt) == CURLOPT_SOCKOPTFUNCTION && !_curl_is_sockopt_cb(value))\ - _curl_easy_setopt_err_sockopt_cb(); \ - if ((_curl_opt) == CURLOPT_OPENSOCKETFUNCTION && \ - !_curl_is_opensocket_cb(value)) \ - _curl_easy_setopt_err_opensocket_cb(); \ - if ((_curl_opt) == CURLOPT_PROGRESSFUNCTION && \ - !_curl_is_progress_cb(value)) \ - _curl_easy_setopt_err_progress_cb(); \ - if ((_curl_opt) == CURLOPT_DEBUGFUNCTION && !_curl_is_debug_cb(value)) \ - _curl_easy_setopt_err_debug_cb(); \ - if ((_curl_opt) == CURLOPT_SSL_CTX_FUNCTION && \ - !_curl_is_ssl_ctx_cb(value)) \ - _curl_easy_setopt_err_ssl_ctx_cb(); \ - if (_curl_is_conv_cb_option(_curl_opt) && !_curl_is_conv_cb(value)) \ - _curl_easy_setopt_err_conv_cb(); \ - if ((_curl_opt) == CURLOPT_SEEKFUNCTION && !_curl_is_seek_cb(value)) \ - _curl_easy_setopt_err_seek_cb(); \ - if (_curl_is_cb_data_option(_curl_opt) && !_curl_is_cb_data(value)) \ - _curl_easy_setopt_err_cb_data(); \ - if ((_curl_opt) == CURLOPT_ERRORBUFFER && !_curl_is_error_buffer(value)) \ - _curl_easy_setopt_err_error_buffer(); \ - if ((_curl_opt) == CURLOPT_STDERR && !_curl_is_FILE(value)) \ - _curl_easy_setopt_err_FILE(); \ - if (_curl_is_postfields_option(_curl_opt) && !_curl_is_postfields(value)) \ - _curl_easy_setopt_err_postfields(); \ - if ((_curl_opt) == CURLOPT_HTTPPOST && \ - !_curl_is_arr((value), struct curl_httppost)) \ - _curl_easy_setopt_err_curl_httpost(); \ - if (_curl_is_slist_option(_curl_opt) && \ - !_curl_is_arr((value), struct curl_slist)) \ - _curl_easy_setopt_err_curl_slist(); \ - if ((_curl_opt) == CURLOPT_SHARE && !_curl_is_ptr((value), CURLSH)) \ - _curl_easy_setopt_err_CURLSH(); \ - } \ - curl_easy_setopt(handle, _curl_opt, value); \ -}) - -/* wraps curl_easy_getinfo() with typechecking */ -/* FIXME: don't allow const pointers */ -#define curl_easy_getinfo(handle, info, arg) \ -__extension__ ({ \ - __typeof__ (info) _curl_info = info; \ - if (__builtin_constant_p(_curl_info)) { \ - if (_curl_is_string_info(_curl_info) && !_curl_is_arr((arg), char *)) \ - _curl_easy_getinfo_err_string(); \ - if (_curl_is_long_info(_curl_info) && !_curl_is_arr((arg), long)) \ - _curl_easy_getinfo_err_long(); \ - if (_curl_is_double_info(_curl_info) && !_curl_is_arr((arg), double)) \ - _curl_easy_getinfo_err_double(); \ - if (_curl_is_slist_info(_curl_info) && \ - !_curl_is_arr((arg), struct curl_slist *)) \ - _curl_easy_getinfo_err_curl_slist(); \ - } \ - curl_easy_getinfo(handle, _curl_info, arg); \ -}) - -/* TODO: typechecking for curl_share_setopt() and curl_multi_setopt(), - * for now just make sure that the functions are called with three - * arguments - */ -#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) -#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) - - -/* the actual warnings, triggered by calling the _curl_easy_setopt_err* - * functions */ - -/* To define a new warning, use _CURL_WARNING(identifier, "message") */ -#define _CURL_WARNING(id, message) \ - static void __attribute__((warning(message))) __attribute__((unused)) \ - __attribute__((noinline)) id(void) { __asm__(""); } - -_CURL_WARNING(_curl_easy_setopt_err_long, - "curl_easy_setopt expects a long argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_off_t, - "curl_easy_setopt expects a curl_off_t argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_string, - "curl_easy_setopt expects a string (char* or char[]) argument for this option" - ) -_CURL_WARNING(_curl_easy_setopt_err_write_callback, - "curl_easy_setopt expects a curl_write_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_read_cb, - "curl_easy_setopt expects a curl_read_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_ioctl_cb, - "curl_easy_setopt expects a curl_ioctl_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_sockopt_cb, - "curl_easy_setopt expects a curl_sockopt_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_opensocket_cb, - "curl_easy_setopt expects a curl_opensocket_callback argument for this option" - ) -_CURL_WARNING(_curl_easy_setopt_err_progress_cb, - "curl_easy_setopt expects a curl_progress_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_debug_cb, - "curl_easy_setopt expects a curl_debug_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_ssl_ctx_cb, - "curl_easy_setopt expects a curl_ssl_ctx_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_conv_cb, - "curl_easy_setopt expects a curl_conv_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_seek_cb, - "curl_easy_setopt expects a curl_seek_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_cb_data, - "curl_easy_setopt expects a private data pointer as argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_error_buffer, - "curl_easy_setopt expects a char buffer of CURL_ERROR_SIZE as argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_FILE, - "curl_easy_setopt expects a FILE* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_postfields, - "curl_easy_setopt expects a void* or char* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_httpost, - "curl_easy_setopt expects a struct curl_httppost* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_slist, - "curl_easy_setopt expects a struct curl_slist* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_CURLSH, - "curl_easy_setopt expects a CURLSH* argument for this option") - -_CURL_WARNING(_curl_easy_getinfo_err_string, - "curl_easy_getinfo expects a pointer to char * for this info") -_CURL_WARNING(_curl_easy_getinfo_err_long, - "curl_easy_getinfo expects a pointer to long for this info") -_CURL_WARNING(_curl_easy_getinfo_err_double, - "curl_easy_getinfo expects a pointer to double for this info") -_CURL_WARNING(_curl_easy_getinfo_err_curl_slist, - "curl_easy_getinfo expects a pointer to struct curl_slist * for this info") - -/* groups of curl_easy_setops options that take the same type of argument */ - -/* To add a new option to one of the groups, just add - * (option) == CURLOPT_SOMETHING - * to the or-expression. If the option takes a long or curl_off_t, you don't - * have to do anything - */ - -/* evaluates to true if option takes a long argument */ -#define _curl_is_long_option(option) \ - (0 < (option) && (option) < CURLOPTTYPE_OBJECTPOINT) - -#define _curl_is_off_t_option(option) \ - ((option) > CURLOPTTYPE_OFF_T) - -/* evaluates to true if option takes a char* argument */ -#define _curl_is_string_option(option) \ - ((option) == CURLOPT_URL || \ - (option) == CURLOPT_PROXY || \ - (option) == CURLOPT_INTERFACE || \ - (option) == CURLOPT_NETRC_FILE || \ - (option) == CURLOPT_USERPWD || \ - (option) == CURLOPT_USERNAME || \ - (option) == CURLOPT_PASSWORD || \ - (option) == CURLOPT_PROXYUSERPWD || \ - (option) == CURLOPT_PROXYUSERNAME || \ - (option) == CURLOPT_PROXYPASSWORD || \ - (option) == CURLOPT_NOPROXY || \ - (option) == CURLOPT_ENCODING || \ - (option) == CURLOPT_REFERER || \ - (option) == CURLOPT_USERAGENT || \ - (option) == CURLOPT_COOKIE || \ - (option) == CURLOPT_COOKIEFILE || \ - (option) == CURLOPT_COOKIEJAR || \ - (option) == CURLOPT_COOKIELIST || \ - (option) == CURLOPT_FTPPORT || \ - (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ - (option) == CURLOPT_FTP_ACCOUNT || \ - (option) == CURLOPT_RANGE || \ - (option) == CURLOPT_CUSTOMREQUEST || \ - (option) == CURLOPT_SSLCERT || \ - (option) == CURLOPT_SSLCERTTYPE || \ - (option) == CURLOPT_SSLKEY || \ - (option) == CURLOPT_SSLKEYTYPE || \ - (option) == CURLOPT_KEYPASSWD || \ - (option) == CURLOPT_SSLENGINE || \ - (option) == CURLOPT_CAINFO || \ - (option) == CURLOPT_CAPATH || \ - (option) == CURLOPT_RANDOM_FILE || \ - (option) == CURLOPT_EGDSOCKET || \ - (option) == CURLOPT_SSL_CIPHER_LIST || \ - (option) == CURLOPT_KRBLEVEL || \ - (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ - (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ - (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ - (option) == CURLOPT_CRLFILE || \ - (option) == CURLOPT_ISSUERCERT || \ - 0) - -/* evaluates to true if option takes a curl_write_callback argument */ -#define _curl_is_write_cb_option(option) \ - ((option) == CURLOPT_HEADERFUNCTION || \ - (option) == CURLOPT_WRITEFUNCTION) - -/* evaluates to true if option takes a curl_conv_callback argument */ -#define _curl_is_conv_cb_option(option) \ - ((option) == CURLOPT_CONV_TO_NETWORK_FUNCTION || \ - (option) == CURLOPT_CONV_FROM_NETWORK_FUNCTION || \ - (option) == CURLOPT_CONV_FROM_UTF8_FUNCTION) - -/* evaluates to true if option takes a data argument to pass to a callback */ -#define _curl_is_cb_data_option(option) \ - ((option) == CURLOPT_WRITEDATA || \ - (option) == CURLOPT_READDATA || \ - (option) == CURLOPT_IOCTLDATA || \ - (option) == CURLOPT_SOCKOPTDATA || \ - (option) == CURLOPT_OPENSOCKETDATA || \ - (option) == CURLOPT_PROGRESSDATA || \ - (option) == CURLOPT_WRITEHEADER || \ - (option) == CURLOPT_DEBUGDATA || \ - (option) == CURLOPT_SSL_CTX_DATA || \ - (option) == CURLOPT_SEEKDATA || \ - (option) == CURLOPT_PRIVATE || \ - 0) - -/* evaluates to true if option takes a POST data argument (void* or char*) */ -#define _curl_is_postfields_option(option) \ - ((option) == CURLOPT_POSTFIELDS || \ - (option) == CURLOPT_COPYPOSTFIELDS || \ - 0) - -/* evaluates to true if option takes a struct curl_slist * argument */ -#define _curl_is_slist_option(option) \ - ((option) == CURLOPT_HTTPHEADER || \ - (option) == CURLOPT_HTTP200ALIASES || \ - (option) == CURLOPT_QUOTE || \ - (option) == CURLOPT_POSTQUOTE || \ - (option) == CURLOPT_PREQUOTE || \ - (option) == CURLOPT_TELNETOPTIONS || \ - 0) - -/* groups of curl_easy_getinfo infos that take the same type of argument */ - -/* evaluates to true if info expects a pointer to char * argument */ -#define _curl_is_string_info(info) \ - (CURLINFO_STRING < (info) && (info) < CURLINFO_LONG) - -/* evaluates to true if info expects a pointer to long argument */ -#define _curl_is_long_info(info) \ - (CURLINFO_LONG < (info) && (info) < CURLINFO_DOUBLE) - -/* evaluates to true if info expects a pointer to double argument */ -#define _curl_is_double_info(info) \ - (CURLINFO_DOUBLE < (info) && (info) < CURLINFO_SLIST) - -/* true if info expects a pointer to struct curl_slist * argument */ -#define _curl_is_slist_info(info) \ - (CURLINFO_SLIST < (info)) - - -/* typecheck helpers -- check whether given expression has requested type*/ - -/* For pointers, you can use the _curl_is_ptr/_curl_is_arr macros, - * otherwise define a new macro. Search for __builtin_types_compatible_p - * in the GCC manual. - * NOTE: these macros MUST NOT EVALUATE their arguments! The argument is - * the actual expression passed to the curl_easy_setopt macro. This - * means that you can only apply the sizeof and __typeof__ operators, no - * == or whatsoever. - */ - -/* XXX: should evaluate to true iff expr is a pointer */ -#define _curl_is_any_ptr(expr) \ - (sizeof(expr) == sizeof(void*)) - -/* evaluates to true if expr is NULL */ -/* XXX: must not evaluate expr, so this check is not accurate */ -#define _curl_is_NULL(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), __typeof__(NULL))) - -/* evaluates to true if expr is type*, const type* or NULL */ -#define _curl_is_ptr(expr, type) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), type *) || \ - __builtin_types_compatible_p(__typeof__(expr), const type *)) - -/* evaluates to true if expr is one of type[], type*, NULL or const type* */ -#define _curl_is_arr(expr, type) \ - (_curl_is_ptr((expr), type) || \ - __builtin_types_compatible_p(__typeof__(expr), type [])) - -/* evaluates to true if expr is a string */ -#define _curl_is_string(expr) \ - (_curl_is_arr((expr), char) || \ - _curl_is_arr((expr), signed char) || \ - _curl_is_arr((expr), unsigned char)) - -/* evaluates to true if expr is a long (no matter the signedness) - * XXX: for now, int is also accepted (and therefore short and char, which - * are promoted to int when passed to a variadic function) */ -#define _curl_is_long(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), long) || \ - __builtin_types_compatible_p(__typeof__(expr), signed long) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned long) || \ - __builtin_types_compatible_p(__typeof__(expr), int) || \ - __builtin_types_compatible_p(__typeof__(expr), signed int) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned int) || \ - __builtin_types_compatible_p(__typeof__(expr), short) || \ - __builtin_types_compatible_p(__typeof__(expr), signed short) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned short) || \ - __builtin_types_compatible_p(__typeof__(expr), char) || \ - __builtin_types_compatible_p(__typeof__(expr), signed char) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned char)) - -/* evaluates to true if expr is of type curl_off_t */ -#define _curl_is_off_t(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), curl_off_t)) - -/* evaluates to true if expr is abuffer suitable for CURLOPT_ERRORBUFFER */ -/* XXX: also check size of an char[] array? */ -#define _curl_is_error_buffer(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), char *) || \ - __builtin_types_compatible_p(__typeof__(expr), char[])) - -/* evaluates to true if expr is of type (const) void* or (const) FILE* */ -#if 0 -#define _curl_is_cb_data(expr) \ - (_curl_is_ptr((expr), void) || \ - _curl_is_ptr((expr), FILE)) -#else /* be less strict */ -#define _curl_is_cb_data(expr) \ - _curl_is_any_ptr(expr) -#endif - -/* evaluates to true if expr is of type FILE* */ -#define _curl_is_FILE(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), FILE *)) - -/* evaluates to true if expr can be passed as POST data (void* or char*) */ -#define _curl_is_postfields(expr) \ - (_curl_is_ptr((expr), void) || \ - _curl_is_arr((expr), char)) - -/* FIXME: the whole callback checking is messy... - * The idea is to tolerate char vs. void and const vs. not const - * pointers in arguments at least - */ -/* helper: __builtin_types_compatible_p distinguishes between functions and - * function pointers, hide it */ -#define _curl_callback_compatible(func, type) \ - (__builtin_types_compatible_p(__typeof__(func), type) || \ - __builtin_types_compatible_p(__typeof__(func), type*)) - -/* evaluates to true if expr is of type curl_read_callback or "similar" */ -#define _curl_is_read_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), __typeof__(fread)) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_read_callback) || \ - _curl_callback_compatible((expr), _curl_read_callback1) || \ - _curl_callback_compatible((expr), _curl_read_callback2) || \ - _curl_callback_compatible((expr), _curl_read_callback3) || \ - _curl_callback_compatible((expr), _curl_read_callback4) || \ - _curl_callback_compatible((expr), _curl_read_callback5) || \ - _curl_callback_compatible((expr), _curl_read_callback6)) -typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void*); -typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE*); -typedef size_t (_curl_read_callback4)(void *, size_t, size_t, void*); -typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); - -/* evaluates to true if expr is of type curl_write_callback or "similar" */ -#define _curl_is_write_cb(expr) \ - (_curl_is_read_cb(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), __typeof__(fwrite)) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_write_callback) || \ - _curl_callback_compatible((expr), _curl_write_callback1) || \ - _curl_callback_compatible((expr), _curl_write_callback2) || \ - _curl_callback_compatible((expr), _curl_write_callback3) || \ - _curl_callback_compatible((expr), _curl_write_callback4) || \ - _curl_callback_compatible((expr), _curl_write_callback5) || \ - _curl_callback_compatible((expr), _curl_write_callback6)) -typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void*); -typedef size_t (_curl_write_callback2)(const char *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE*); -typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void*); -typedef size_t (_curl_write_callback5)(const void *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); - -/* evaluates to true if expr is of type curl_ioctl_callback or "similar" */ -#define _curl_is_ioctl_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_ioctl_callback) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback1) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback2) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback3) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback4)) -typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void*); -typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void*); -typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void*); -typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void*); - -/* evaluates to true if expr is of type curl_sockopt_callback or "similar" */ -#define _curl_is_sockopt_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_sockopt_callback) || \ - _curl_callback_compatible((expr), _curl_sockopt_callback1) || \ - _curl_callback_compatible((expr), _curl_sockopt_callback2)) -typedef int (_curl_sockopt_callback1)(void *, curl_socket_t, curlsocktype); -typedef int (_curl_sockopt_callback2)(const void *, curl_socket_t, - curlsocktype); - -/* evaluates to true if expr is of type curl_opensocket_callback or "similar" */ -#define _curl_is_opensocket_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_opensocket_callback) ||\ - _curl_callback_compatible((expr), _curl_opensocket_callback1) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback2) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback3) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback4)) -typedef curl_socket_t (_curl_opensocket_callback1) - (void *, curlsocktype, struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback2) - (void *, curlsocktype, const struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback3) - (const void *, curlsocktype, struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback4) - (const void *, curlsocktype, const struct curl_sockaddr *); - -/* evaluates to true if expr is of type curl_progress_callback or "similar" */ -#define _curl_is_progress_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_progress_callback) || \ - _curl_callback_compatible((expr), _curl_progress_callback1) || \ - _curl_callback_compatible((expr), _curl_progress_callback2)) -typedef int (_curl_progress_callback1)(void *, - double, double, double, double); -typedef int (_curl_progress_callback2)(const void *, - double, double, double, double); - -/* evaluates to true if expr is of type curl_debug_callback or "similar" */ -#define _curl_is_debug_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_debug_callback) || \ - _curl_callback_compatible((expr), _curl_debug_callback1) || \ - _curl_callback_compatible((expr), _curl_debug_callback2) || \ - _curl_callback_compatible((expr), _curl_debug_callback3) || \ - _curl_callback_compatible((expr), _curl_debug_callback4)) -typedef int (_curl_debug_callback1) (CURL *, - curl_infotype, char *, size_t, void *); -typedef int (_curl_debug_callback2) (CURL *, - curl_infotype, char *, size_t, const void *); -typedef int (_curl_debug_callback3) (CURL *, - curl_infotype, const char *, size_t, void *); -typedef int (_curl_debug_callback4) (CURL *, - curl_infotype, const char *, size_t, const void *); - -/* evaluates to true if expr is of type curl_ssl_ctx_callback or "similar" */ -/* this is getting even messier... */ -#define _curl_is_ssl_ctx_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_ssl_ctx_callback) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback1) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback2) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback3) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback4) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback5) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback6) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback7) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback8)) -typedef CURLcode (_curl_ssl_ctx_callback1)(CURL *, void *, void *); -typedef CURLcode (_curl_ssl_ctx_callback2)(CURL *, void *, const void *); -typedef CURLcode (_curl_ssl_ctx_callback3)(CURL *, const void *, void *); -typedef CURLcode (_curl_ssl_ctx_callback4)(CURL *, const void *, const void *); -#ifdef HEADER_SSL_H -/* hack: if we included OpenSSL's ssl.h, we know about SSL_CTX - * this will of course break if we're included before OpenSSL headers... - */ -typedef CURLcode (_curl_ssl_ctx_callback5)(CURL *, SSL_CTX, void *); -typedef CURLcode (_curl_ssl_ctx_callback6)(CURL *, SSL_CTX, const void *); -typedef CURLcode (_curl_ssl_ctx_callback7)(CURL *, const SSL_CTX, void *); -typedef CURLcode (_curl_ssl_ctx_callback8)(CURL *, const SSL_CTX, const void *); -#else -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback5; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback6; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback7; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback8; -#endif - -/* evaluates to true if expr is of type curl_conv_callback or "similar" */ -#define _curl_is_conv_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_conv_callback) || \ - _curl_callback_compatible((expr), _curl_conv_callback1) || \ - _curl_callback_compatible((expr), _curl_conv_callback2) || \ - _curl_callback_compatible((expr), _curl_conv_callback3) || \ - _curl_callback_compatible((expr), _curl_conv_callback4)) -typedef CURLcode (*_curl_conv_callback1)(char *, size_t length); -typedef CURLcode (*_curl_conv_callback2)(const char *, size_t length); -typedef CURLcode (*_curl_conv_callback3)(void *, size_t length); -typedef CURLcode (*_curl_conv_callback4)(const void *, size_t length); - -/* evaluates to true if expr is of type curl_seek_callback or "similar" */ -#define _curl_is_seek_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_seek_callback) || \ - _curl_callback_compatible((expr), _curl_seek_callback1) || \ - _curl_callback_compatible((expr), _curl_seek_callback2)) -typedef CURLcode (*_curl_seek_callback1)(void *, curl_off_t, int); -typedef CURLcode (*_curl_seek_callback2)(const void *, curl_off_t, int); - - -#endif /* __CURL_TYPECHECK_GCC_H */ diff --git a/src/utils/curl/types.h b/src/utils/curl/types.h deleted file mode 100644 index d37d6ae9e11e48d3f8aaba1957015a03f7f12c63..0000000000000000000000000000000000000000 --- a/src/utils/curl/types.h +++ /dev/null @@ -1 +0,0 @@ -/* not used */ diff --git a/src/utils/fileType/fileType.h b/src/utils/fileType/fileType.h index 3642a4ba3e830cb1f8cdececd6b622f6505fb10e..adf2d6215fd1e277375dbc0d20b74baf4f08a679 100644 --- a/src/utils/fileType/fileType.h +++ b/src/utils/fileType/fileType.h @@ -26,9 +26,9 @@ using namespace std; /***************************************************************************** - Convenience functions to detect whether a given file is + Convenience functions to detect whether a given file is "regular" and/or "gzipped". - + Kindly contributed by Assaf Gordon. ******************************************************************************/ string string_error(int errnum); diff --git a/src/utils/genomeFile/genomeFile.cpp b/src/utils/genomeFile/genomeFile.cpp index 5a69736078383240bdebf10d97a6310b9766b1dc..67a280ea1b681c7caf286e9159db413f9288478a 100644 --- a/src/utils/genomeFile/genomeFile.cpp +++ b/src/utils/genomeFile/genomeFile.cpp @@ -14,8 +14,8 @@ GenomeFile::GenomeFile(const string &genomeFile) { - _genomeFile = genomeFile; - loadGenomeFileIntoMap(); + _genomeFile = genomeFile; + loadGenomeFileIntoMap(); } // Destructor @@ -25,69 +25,69 @@ GenomeFile::~GenomeFile(void) { void GenomeFile::loadGenomeFileIntoMap() { - string genomeLine; - int lineNum = 0; - vector<string> genomeFields; // vector for a GENOME entry - - // open the GENOME file for reading - ifstream genome(_genomeFile.c_str(), ios::in); - if ( !genome ) { - cerr << "Error: The requested genome file (" << _genomeFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - - while (getline(genome, genomeLine)) { - - Tokenize(genomeLine,genomeFields); // load the fields into the vector - lineNum++; - - // ignore a blank line - if (genomeFields.size() > 0) { - if (genomeFields[0].find("#") == string::npos) { - - // we need at least 2 columns - if (genomeFields.size() >= 2) { - char *p2End; - long c2; - // make sure the second column is numeric. - c2 = strtol(genomeFields[1].c_str(), &p2End, 10); - - // strtol will set p2End to the start of the string if non-integral, base 10 - if (p2End != genomeFields[1].c_str()) { - string chrom = genomeFields[0]; - int size = atoi(genomeFields[1].c_str()); - _chromSizes[chrom] = size; - _chromList.push_back(chrom); - } - } - else { - cerr << "Less than the req'd two fields were encountered in the genome file (" << _genomeFile << ")"; - cerr << " at line " << lineNum << ". Exiting." << endl; - exit (1); - } - } - } - genomeFields.clear(); - } + string genomeLine; + int lineNum = 0; + vector<string> genomeFields; // vector for a GENOME entry + + // open the GENOME file for reading + ifstream genome(_genomeFile.c_str(), ios::in); + if ( !genome ) { + cerr << "Error: The requested genome file (" << _genomeFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + while (getline(genome, genomeLine)) { + + Tokenize(genomeLine,genomeFields); // load the fields into the vector + lineNum++; + + // ignore a blank line + if (genomeFields.size() > 0) { + if (genomeFields[0].find("#") == string::npos) { + + // we need at least 2 columns + if (genomeFields.size() >= 2) { + char *p2End; + long c2; + // make sure the second column is numeric. + c2 = strtol(genomeFields[1].c_str(), &p2End, 10); + + // strtol will set p2End to the start of the string if non-integral, base 10 + if (p2End != genomeFields[1].c_str()) { + string chrom = genomeFields[0]; + int size = atoi(genomeFields[1].c_str()); + _chromSizes[chrom] = size; + _chromList.push_back(chrom); + } + } + else { + cerr << "Less than the req'd two fields were encountered in the genome file (" << _genomeFile << ")"; + cerr << " at line " << lineNum << ". Exiting." << endl; + exit (1); + } + } + } + genomeFields.clear(); + } } int GenomeFile::getChromSize(const string &chrom) { - chromToSizes::const_iterator chromIt = _chromSizes.find(chrom); - if (chromIt != _chromSizes.end()) - return _chromSizes[chrom]; - else - return -1; // chrom not found. + chromToSizes::const_iterator chromIt = _chromSizes.find(chrom); + if (chromIt != _chromSizes.end()) + return _chromSizes[chrom]; + else + return -1; // chrom not found. } vector<string> GenomeFile::getChromList() { - return _chromList; + return _chromList; } int GenomeFile::getNumberOfChroms() { - return _chromList.size(); + return _chromList.size(); } string GenomeFile::getGenomeFileName() { - return _genomeFile; + return _genomeFile; } diff --git a/src/utils/genomeFile/genomeFile.h b/src/utils/genomeFile/genomeFile.h index 9f88f67024635e150fc7b26d8a9f88d04cbe3e85..5da8d6b968c046e046dfca0b46619d18699dcf92 100644 --- a/src/utils/genomeFile/genomeFile.h +++ b/src/utils/genomeFile/genomeFile.h @@ -31,26 +31,26 @@ class GenomeFile { public: - // Constructor - GenomeFile(const string &genomeFile); + // Constructor + GenomeFile(const string &genomeFile); - // Destructor - ~GenomeFile(void); + // Destructor + ~GenomeFile(void); - // load a GENOME file into a map keyed by chrom. value is size of chrom. - void loadGenomeFileIntoMap(); + // load a GENOME file into a map keyed by chrom. value is size of chrom. + void loadGenomeFileIntoMap(); + + int getChromSize(const string &chrom); // return the size of a chromosome + vector<string> getChromList(); // return a list of chrom names + int getNumberOfChroms(); // return the number of chroms + string getGenomeFileName(); // return the name of the genome file - int getChromSize(const string &chrom); // return the size of a chromosome - vector<string> getChromList(); // return a list of chrom names - int getNumberOfChroms(); // return the number of chroms - string getGenomeFileName(); // return the name of the genome file - private: - string _genomeFile; - chromToSizes _chromSizes; - vector<string> _chromList; + string _genomeFile; + chromToSizes _chromSizes; + vector<string> _chromList; }; #endif /* GENOMEFILE_H */ diff --git a/src/utils/gzstream/gzstream.C b/src/utils/gzstream/gzstream.C index 8cb4590e1fb704dd0a3087448b8ee2c8d44c3297..4633c0e928cdcadb16397498081a5d393b3b0897 100644 --- a/src/utils/gzstream/gzstream.C +++ b/src/utils/gzstream/gzstream.C @@ -21,8 +21,8 @@ // Revision : $Revision: 1.7 $ // Revision_date : $Date: 2003/01/08 14:41:27 $ // Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The +// +// Standard streambuf implementation following Nicolai Josuttis, "The // Standard C++ Library". // ============================================================================ @@ -97,7 +97,7 @@ int gzstreambuf::underflow() { // used for input buffer only buffer + 4 + num); // end of buffer // return next character - return * reinterpret_cast<unsigned char *>( gptr()); + return * reinterpret_cast<unsigned char *>( gptr()); } int gzstreambuf::flush_buffer() { diff --git a/src/utils/gzstream/gzstream.h b/src/utils/gzstream/gzstream.h index 861653f4810f244e60ec1e049ec60bc903a573ce..85910712151bc6fdd16ab8ca2eb7430a9f0a3155 100644 --- a/src/utils/gzstream/gzstream.h +++ b/src/utils/gzstream/gzstream.h @@ -21,8 +21,8 @@ // Revision : $Revision: 1.5 $ // Revision_date : $Date: 2002/04/26 23:30:15 $ // Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The +// +// Standard streambuf implementation following Nicolai Josuttis, "The // Standard C++ Library". // ============================================================================ @@ -58,14 +58,14 @@ public: setp( buffer, buffer + (bufferSize-1)); setg( buffer + 4, // beginning of putback area buffer + 4, // read position - buffer + 4); // end position + buffer + 4); // end position // ASSERT: both input & output capabilities will not be used together } int is_open() { return opened; } gzstreambuf* open( const char* name, int open_mode); gzstreambuf* close(); ~gzstreambuf() { close(); } - + virtual int overflow( int c = EOF); virtual int underflow(); virtual int sync(); @@ -85,15 +85,15 @@ public: // ---------------------------------------------------------------------------- // User classes. Use igzstream and ogzstream analogously to ifstream and -// ofstream respectively. They read and write files based on the gz* +// ofstream respectively. They read and write files based on the gz* // function interface of the zlib. Files are compatible with gzip compression. // ---------------------------------------------------------------------------- class igzstream : public gzstreambase, public std::istream { public: - igzstream() : std::istream( &buf) {} + igzstream() : std::istream( &buf) {} igzstream( const char* name, int open_mode = std::ios::in) - : gzstreambase( name, open_mode), std::istream( &buf) {} + : gzstreambase( name, open_mode), std::istream( &buf) {} gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } void open( const char* name, int open_mode = std::ios::in) { gzstreambase::open( name, open_mode); @@ -104,7 +104,7 @@ class ogzstream : public gzstreambase, public std::ostream { public: ogzstream() : std::ostream( &buf) {} ogzstream( const char* name, int mode = std::ios::out) - : gzstreambase( name, mode), std::ostream( &buf) {} + : gzstreambase( name, mode), std::ostream( &buf) {} gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } void open( const char* name, int open_mode = std::ios::out) { gzstreambase::open( name, open_mode); diff --git a/src/utils/lineFileUtilities/lineFileUtilities.cpp b/src/utils/lineFileUtilities/lineFileUtilities.cpp index 3c80fef6296dda942e94d78d0582cbbcbeec92db..81cb6595f23cb964d2b9e953ac47a84c0903889d 100644 --- a/src/utils/lineFileUtilities/lineFileUtilities.cpp +++ b/src/utils/lineFileUtilities/lineFileUtilities.cpp @@ -1,7 +1,7 @@ -// +// // lineFileUtilities.cpp // BEDTools -// +// // Created by Aaron Quinlan Spring 2009. // Copyright 2009 Aaron Quinlan. All rights reserved. // @@ -12,7 +12,7 @@ #include "lineFileUtilities.h" //*********************************************** // lineFileUtilities: -// Common Functions +// Common Functions //*********************************************** // void Tokenize(const string &str, vector<string> &tokens, const string &delimiter) { @@ -20,7 +20,7 @@ // string::size_type lastPos = str.find_first_not_of(delimiter, 0); // // Find first "non-delimiter". // string::size_type pos = str.find_first_of(delimiter, lastPos); -// +// // while (string::npos != pos || string::npos != lastPos) { // // Found a token, add it to the vector. // tokens.push_back(str.substr(lastPos, pos - lastPos)); @@ -30,13 +30,13 @@ // pos = str.find_first_of(delimiter, lastPos); // } // } -// +// // void Tokenize(const string &str, vector<int> &tokens, const string &delimiter) { // // Skip delimiters at beginning. // string::size_type lastPos = str.find_first_not_of(delimiter, 0); // // Find first "non-delimiter". // string::size_type pos = str.find_first_of(delimiter, lastPos); -// +// // while (string::npos != pos || string::npos != lastPos) { // // Found a token, add it to the vector. // tokens.push_back(atoi(str.substr(lastPos, pos - lastPos).c_str())); diff --git a/src/utils/lineFileUtilities/lineFileUtilities.h b/src/utils/lineFileUtilities/lineFileUtilities.h index f1aac59ea1903aad3ca873fe1e203fb7560dcf8b..97d3a5209453fc168054699c46d29963728dbc84 100644 --- a/src/utils/lineFileUtilities/lineFileUtilities.h +++ b/src/utils/lineFileUtilities/lineFileUtilities.h @@ -16,26 +16,26 @@ using namespace std; template <typename T> inline std::string ToString(const T & value) { - std::stringstream ss; - ss << value; - return ss.str(); + std::stringstream ss; + ss << value; + return ss.str(); } inline void Tokenize(const string &str, vector<string> &tokens, const string &delimiter = "\t") { - // Skip delimiters at beginning. - string::size_type lastPos = str.find_first_not_of(delimiter, 0); - // Find first "non-delimiter". - string::size_type pos = str.find_first_of(delimiter, lastPos); - - while (string::npos != pos || string::npos != lastPos) { - // Found a token, add it to the vector. - tokens.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of(delimiter, pos); - // Find next "non-delimiter" - pos = str.find_first_of(delimiter, lastPos); - } + // Skip delimiters at beginning. + string::size_type lastPos = str.find_first_not_of(delimiter, 0); + // Find first "non-delimiter". + string::size_type pos = str.find_first_of(delimiter, lastPos); + + while (string::npos != pos || string::npos != lastPos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiter, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiter, lastPos); + } } inline diff --git a/src/utils/sequenceUtilities/sequenceUtils.cpp b/src/utils/sequenceUtilities/sequenceUtils.cpp index e43b57c3dd8d648089bad7943e1427f493f99c4a..1995c119cc9c28feb5d75e47d2235ea92c1ae9b0 100644 --- a/src/utils/sequenceUtilities/sequenceUtils.cpp +++ b/src/utils/sequenceUtilities/sequenceUtils.cpp @@ -1,7 +1,7 @@ -// +// // sequenceUtils.cpp // BEDTools -// +// // Created by Aaron Quinlan Spring 2009. // Copyright 2009 Aaron Quinlan. All rights reserved. // @@ -14,72 +14,72 @@ // Performs an in-place sequence reversal void reverseSequence(string &sequence) { - std::reverse(sequence.begin(), sequence.end()); + std::reverse(sequence.begin(), sequence.end()); } // Performs an in-place reverse complement conversion void reverseComplement(string &sequence) { - // reverse the sequence - reverseSequence(sequence); + // reverse the sequence + reverseSequence(sequence); - // swap the bases - for(unsigned int i = 0; i < sequence.length(); i++) { - switch(sequence[i]) { - case 'A': - sequence[i] = 'T'; - break; - case 'C': - sequence[i] = 'G'; - break; - case 'G': - sequence[i] = 'C'; - break; - case 'T': - sequence[i] = 'A'; - break; - case 'a': - sequence[i] = 't'; - break; - case 'c': - sequence[i] = 'g'; - break; - case 'g': - sequence[i] = 'c'; - break; - case 't': - sequence[i] = 'a'; - break; - default: - break; - } - } + // swap the bases + for(unsigned int i = 0; i < sequence.length(); i++) { + switch(sequence[i]) { + case 'A': + sequence[i] = 'T'; + break; + case 'C': + sequence[i] = 'G'; + break; + case 'G': + sequence[i] = 'C'; + break; + case 'T': + sequence[i] = 'A'; + break; + case 'a': + sequence[i] = 't'; + break; + case 'c': + sequence[i] = 'g'; + break; + case 'g': + sequence[i] = 'c'; + break; + case 't': + sequence[i] = 'a'; + break; + default: + break; + } + } } void toLowerCase(std::string &str) { - - const int length = str.length(); - for(int i=0; i < length; ++i) - { - str[i] = std::tolower(str[i]); - } - - // alternate, C++ style. - //transform(str.start(), str.end(), str.start(), std::tolower); + + const int length = str.length(); + for(int i=0; i < length; ++i) + { + str[i] = std::tolower(str[i]); + } + + // alternate, C++ style. + //transform(str.start(), str.end(), str.start(), std::tolower); } void toUpperCase(std::string &str) { - - const int length = str.length(); - for(int i=0; i < length; ++i) - { - str[i] = std::toupper(str[i]); - } - - // alternate, C++ style. - //transform(str.start(), str.end(), str.start(), std::toupper); + + const int length = str.length(); + for(int i=0; i < length; ++i) + { + str[i] = std::toupper(str[i]); + } + + // alternate, C++ style. + //transform(str.start(), str.end(), str.start(), std::toupper); } diff --git a/src/utils/sequenceUtilities/sequenceUtils.h b/src/utils/sequenceUtilities/sequenceUtils.h index d71383f9035fd45035f5515bbf076e520d7f8592..206e0fc8d371ce0114c53dcefc3232ad3a9a717b 100644 --- a/src/utils/sequenceUtilities/sequenceUtils.h +++ b/src/utils/sequenceUtilities/sequenceUtils.h @@ -19,4 +19,4 @@ void toLowerCase(string &); // Converts every character in a string to uppercase void toUpperCase(string &); -#endif +#endif diff --git a/src/utils/tabFile/tabFile.cpp b/src/utils/tabFile/tabFile.cpp index c14b12d692bc97a9e47cc50c19623bb365ceece0..7e444426982605268971856039b62f649e3b4b59 100644 --- a/src/utils/tabFile/tabFile.cpp +++ b/src/utils/tabFile/tabFile.cpp @@ -27,73 +27,73 @@ TabFile::~TabFile(void) { void TabFile::Open(void) { - if (_tabFile == "stdin") { - _tabStream = &cin; - } - else { - size_t foundPos; - foundPos = _tabFile.find_last_of(".gz"); - // is this a GZIPPED TAB file? - if (foundPos == _tabFile.size() - 1) { - igzstream tabs(_tabFile.c_str(), ios::in); - if ( !tabs ) { - cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - // if so, close it (this was just a test) - tabs.close(); - // now set a pointer to the stream so that we - // can read the file later on. - _tabStream = new igzstream(_tabFile.c_str(), ios::in); - } - } - // not GZIPPED. - else { - - ifstream tabs(_tabFile.c_str(), ios::in); - // can we open the file? - if ( !tabs ) { - cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - else { - // if so, close it (this was just a test) - tabs.close(); - // now set a pointer to the stream so that we - // can read the file later on. - _tabStream = new ifstream(_tabFile.c_str(), ios::in); - } - } - } + if (_tabFile == "stdin") { + _tabStream = &cin; + } + else { + size_t foundPos; + foundPos = _tabFile.find_last_of(".gz"); + // is this a GZIPPED TAB file? + if (foundPos == _tabFile.size() - 1) { + igzstream tabs(_tabFile.c_str(), ios::in); + if ( !tabs ) { + cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + tabs.close(); + // now set a pointer to the stream so that we + // can read the file later on. + _tabStream = new igzstream(_tabFile.c_str(), ios::in); + } + } + // not GZIPPED. + else { + + ifstream tabs(_tabFile.c_str(), ios::in); + // can we open the file? + if ( !tabs ) { + cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + tabs.close(); + // now set a pointer to the stream so that we + // can read the file later on. + _tabStream = new ifstream(_tabFile.c_str(), ios::in); + } + } + } } // Close the TAB file void TabFile::Close(void) { - if (_tabFile != "stdin") delete _tabStream; + if (_tabFile != "stdin") delete _tabStream; } TabLineStatus TabFile::GetNextTabLine(TAB_FIELDS &tabFields, int &lineNum) { - // make sure there are still lines to process. - // if so, tokenize, return the TAB_FIELDS. - if (_tabStream->good() == true) { - string tabLine; - tabFields.reserve(20); - - // parse the tabStream pointer - getline(*_tabStream, tabLine); - lineNum++; + // make sure there are still lines to process. + // if so, tokenize, return the TAB_FIELDS. + if (_tabStream->good() == true) { + string tabLine; + tabFields.reserve(20); + + // parse the tabStream pointer + getline(*_tabStream, tabLine); + lineNum++; - // split into a string vector. - Tokenize(tabLine, tabFields); + // split into a string vector. + Tokenize(tabLine, tabFields); // parse the line and validate it - return parseTabLine(tabFields, lineNum); - } - - // default if file is closed or EOF - return TAB_INVALID; + return parseTabLine(tabFields, lineNum); + } + + // default if file is closed or EOF + return TAB_INVALID; } diff --git a/src/utils/tabFile/tabFile.h b/src/utils/tabFile/tabFile.h index a4e87706a6fc42e491c12821d2766ae119107f02..4674ee9bd842c729d80f8bf58684a7546b276392 100644 --- a/src/utils/tabFile/tabFile.h +++ b/src/utils/tabFile/tabFile.h @@ -21,7 +21,7 @@ using namespace std; // enum to flag the state of a given line in a TAB file. enum TabLineStatus -{ +{ TAB_INVALID = -1, TAB_HEADER = 0, TAB_BLANK = 1, @@ -37,43 +37,43 @@ class TabFile { public: - // Constructor - TabFile(const string &tabFile); - - // Destructor - ~TabFile(void); - - // Open a TAB file for reading (creates an istream pointer) - void Open(void); - - // Close an opened TAB file. - void Close(void); - - // Get the next TAB entry in an opened TAB file. - TabLineStatus GetNextTabLine (TAB_FIELDS &tab, int &lineNum); - + // Constructor + TabFile(const string &tabFile); + + // Destructor + ~TabFile(void); + + // Open a TAB file for reading (creates an istream pointer) + void Open(void); + + // Close an opened TAB file. + void Close(void); + + // Get the next TAB entry in an opened TAB file. + TabLineStatus GetNextTabLine (TAB_FIELDS &tab, int &lineNum); + private: - - // data - istream *_tabStream; + + // data + istream *_tabStream; string _tabFile; - - // methods - inline TabLineStatus parseTabLine (const vector<string> &lineVector, int &lineNum) { - // bail out if we have a blank line - if (lineVector.size() == 0) - return TAB_BLANK; + + // methods + inline TabLineStatus parseTabLine (const vector<string> &lineVector, int &lineNum) { + // bail out if we have a blank line + if (lineVector.size() == 0) + return TAB_BLANK; // real line with data - if (lineVector[0][0] != '#') { + if (lineVector[0][0] != '#') { return TAB_VALID; } // comment or header line - else { - lineNum--; - return TAB_HEADER; - } - // default - return TAB_INVALID; + else { + lineNum--; + return TAB_HEADER; + } + // default + return TAB_INVALID; } }; diff --git a/src/utils/version/version.h b/src/utils/version/version.h index 978739666a4e005c607bf6acca9f266b64acdcaf..5338638800443224fc09c5d105bdacefdef93586 100644 --- a/src/utils/version/version.h +++ b/src/utils/version/version.h @@ -1,7 +1,7 @@ #ifndef VERSION_H #define VERSION_H -// define the version. All tools in the +// define the version. All tools in the // suite carry the same version number. #define VERSION "2.10.2" diff --git a/src/windowBed/windowBed.cpp b/src/windowBed/windowBed.cpp index a5dc1a96be7a2dd7911258906918bedb179f2a60..4723e3b087b5d57aeef5be7df04633496e9cc92f 100644 --- a/src/windowBed/windowBed.cpp +++ b/src/windowBed/windowBed.cpp @@ -14,40 +14,40 @@ /* - Constructor + Constructor */ -BedWindow::BedWindow(string bedAFile, string bedBFile, int leftSlop, int rightSlop, - bool anyHit, bool noHit, bool writeCount, bool strandWindows, +BedWindow::BedWindow(string bedAFile, string bedBFile, int leftSlop, int rightSlop, + bool anyHit, bool noHit, bool writeCount, bool strandWindows, bool matchOnStrand, bool bamInput, bool bamOutput, bool isUncompressedBam) { - _bedAFile = bedAFile; - _bedBFile = bedBFile; + _bedAFile = bedAFile; + _bedBFile = bedBFile; - _leftSlop = leftSlop; - _rightSlop = rightSlop; + _leftSlop = leftSlop; + _rightSlop = rightSlop; - _anyHit = anyHit; - _noHit = noHit; - _writeCount = writeCount; - _strandWindows = strandWindows; - _matchOnStrand = matchOnStrand; - _bamInput = bamInput; - _bamOutput = bamOutput; + _anyHit = anyHit; + _noHit = noHit; + _writeCount = writeCount; + _strandWindows = strandWindows; + _matchOnStrand = matchOnStrand; + _bamInput = bamInput; + _bamOutput = bamOutput; _isUncompressedBam = isUncompressedBam; - - _bedA = new BedFile(bedAFile); - _bedB = new BedFile(bedBFile); - - if (_bamInput == false) - WindowIntersectBed(); - else - WindowIntersectBam(_bedAFile); + + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + if (_bamInput == false) + WindowIntersectBed(); + else + WindowIntersectBam(_bedAFile); } /* - Destructor + Destructor */ BedWindow::~BedWindow(void) { } @@ -55,187 +55,187 @@ BedWindow::~BedWindow(void) { void BedWindow::FindWindowOverlaps(const BED &a, vector<BED> &hits) { - - /* - Adjust the start and end of a based on the requested window - */ - - // update the current feature's start and end - // according to the slop requested (slop = 0 by default) - CHRPOS aFudgeStart = 0; - CHRPOS aFudgeEnd; - AddWindow(a, aFudgeStart, aFudgeEnd); - - /* - Now report the hits (if any) based on the window around a. - */ - // get the hits in B for the A feature - _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _matchOnStrand); - - int numOverlaps = 0; - - // loop through the hits and report those that meet the user's criteria - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - - int s = max(aFudgeStart, h->start); - int e = min(aFudgeEnd, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end - a.start); // the length of a in b.p. - - if (s < e) { - // is there enough overlap (default ~ 1bp) - if ( ((float) overlapBases / (float) aLength) > 0 ) { - numOverlaps++; - if (_anyHit == false && _noHit == false && _writeCount == false) { - _bedA->reportBedTab(a); - _bedB->reportBedNewLine(*h); - } - } - } - } - if (_anyHit == true && (numOverlaps >= 1)) { - _bedA->reportBedNewLine(a); } - else if (_writeCount == true) { - _bedA->reportBedTab(a); printf("\t%d\n", numOverlaps); - } - else if (_noHit == true && (numOverlaps == 0)) { - _bedA->reportBedNewLine(a); - } + + /* + Adjust the start and end of a based on the requested window + */ + + // update the current feature's start and end + // according to the slop requested (slop = 0 by default) + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; + AddWindow(a, aFudgeStart, aFudgeEnd); + + /* + Now report the hits (if any) based on the window around a. + */ + // get the hits in B for the A feature + _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _matchOnStrand); + + int numOverlaps = 0; + + // loop through the hits and report those that meet the user's criteria + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + + int s = max(aFudgeStart, h->start); + int e = min(aFudgeEnd, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int aLength = (a.end - a.start); // the length of a in b.p. + + if (s < e) { + // is there enough overlap (default ~ 1bp) + if ( ((float) overlapBases / (float) aLength) > 0 ) { + numOverlaps++; + if (_anyHit == false && _noHit == false && _writeCount == false) { + _bedA->reportBedTab(a); + _bedB->reportBedNewLine(*h); + } + } + } + } + if (_anyHit == true && (numOverlaps >= 1)) { + _bedA->reportBedNewLine(a); } + else if (_writeCount == true) { + _bedA->reportBedTab(a); printf("\t%d\n", numOverlaps); + } + else if (_noHit == true && (numOverlaps == 0)) { + _bedA->reportBedNewLine(a); + } } bool BedWindow::FindOneOrMoreWindowOverlaps(const BED &a) { - - // update the current feature's start and end - // according to the slop requested (slop = 0 by default) - CHRPOS aFudgeStart = 0; - CHRPOS aFudgeEnd; - AddWindow(a, aFudgeStart, aFudgeEnd); - - bool overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _matchOnStrand); - return overlapsFound; + + // update the current feature's start and end + // according to the slop requested (slop = 0 by default) + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; + AddWindow(a, aFudgeStart, aFudgeEnd); + + bool overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _matchOnStrand); + return overlapsFound; } - + void BedWindow::WindowIntersectBed() { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - BED a, nullBed; - int lineNum = 0; // current input line number - BedLineStatus bedStatus; - vector<BED> hits; // vector of potential hits - hits.reserve(100); - - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - FindWindowOverlaps(a, hits); - hits.clear(); - a = nullBed; - } - } - _bedA->Close(); + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + BED a, nullBed; + int lineNum = 0; // current input line number + BedLineStatus bedStatus; + vector<BED> hits; // vector of potential hits + hits.reserve(100); + + _bedA->Open(); + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + FindWindowOverlaps(a, hits); + hits.clear(); + a = nullBed; + } + } + _bedA->Close(); } void BedWindow::WindowIntersectBam(string bamFile) { - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); - - // open the BAM file - BamReader reader; - BamWriter writer; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // open a BAM output to stdout if we are writing BAM - if (_bamOutput == true) { - // open our BAM writer - writer.Open("stdout", header, refs, _isUncompressedBam); - } - - vector<BED> hits; // vector of potential hits - // reserve some space - hits.reserve(100); - - _bedA->bedType = 6; - BamAlignment bam; - bool overlapsFound; - // get each set of alignments for each pair. - while (reader.GetNextAlignment(bam)) { - - if (bam.IsMapped()) { - BED a; - a.chrom = refs.at(bam.RefID).RefName; - a.start = bam.Position; - a.end = bam.GetEndPosition(false); - - // build the name field from the BAM alignment. - a.name = bam.Name; - if (bam.IsFirstMate()) a.name += "/1"; - if (bam.IsSecondMate()) a.name += "/2"; - - a.score = ToString(bam.MapQuality); - a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; - - if (_bamOutput == true) { - overlapsFound = FindOneOrMoreWindowOverlaps(a); - if (overlapsFound == true) { - if (_noHit == false) - writer.SaveAlignment(bam); - } - else { - if (_noHit == true) - writer.SaveAlignment(bam); - } - } - else { - FindWindowOverlaps(a, hits); - hits.clear(); - } - } - } - - // close the relevant BAM files. - reader.Close(); - if (_bamOutput == true) { - writer.Close(); - } + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + // open the BAM file + BamReader reader; + BamWriter writer; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // open a BAM output to stdout if we are writing BAM + if (_bamOutput == true) { + // open our BAM writer + writer.Open("stdout", header, refs, _isUncompressedBam); + } + + vector<BED> hits; // vector of potential hits + // reserve some space + hits.reserve(100); + + _bedA->bedType = 6; + BamAlignment bam; + bool overlapsFound; + // get each set of alignments for each pair. + while (reader.GetNextAlignment(bam)) { + + if (bam.IsMapped()) { + BED a; + a.chrom = refs.at(bam.RefID).RefName; + a.start = bam.Position; + a.end = bam.GetEndPosition(false); + + // build the name field from the BAM alignment. + a.name = bam.Name; + if (bam.IsFirstMate()) a.name += "/1"; + if (bam.IsSecondMate()) a.name += "/2"; + + a.score = ToString(bam.MapQuality); + a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; + + if (_bamOutput == true) { + overlapsFound = FindOneOrMoreWindowOverlaps(a); + if (overlapsFound == true) { + if (_noHit == false) + writer.SaveAlignment(bam); + } + else { + if (_noHit == true) + writer.SaveAlignment(bam); + } + } + else { + FindWindowOverlaps(a, hits); + hits.clear(); + } + } + } + + // close the relevant BAM files. + reader.Close(); + if (_bamOutput == true) { + writer.Close(); + } } void BedWindow::AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd) { - // Does the user want to treat the windows based on strand? - // If so, - // if "+", then left is left and right is right - // if "-", the left is right and right is left. - if (_strandWindows) { - if (a.strand == "+") { - if ((a.start - _leftSlop) > 0) fudgeStart = a.start - _leftSlop; - else fudgeStart = 0; - fudgeEnd = a.end + _rightSlop; - } - else { - if ((a.start - _rightSlop) > 0) fudgeStart = a.start - _rightSlop; - else fudgeStart = 0; - fudgeEnd = a.end + _leftSlop; - } - } - // If not, add the windows irrespective of strand - else { - if ((a.start - _leftSlop) > 0) fudgeStart = a.start - _leftSlop; - else fudgeStart = 0; - fudgeEnd = a.end + _rightSlop; - } + // Does the user want to treat the windows based on strand? + // If so, + // if "+", then left is left and right is right + // if "-", the left is right and right is left. + if (_strandWindows) { + if (a.strand == "+") { + if ((a.start - _leftSlop) > 0) fudgeStart = a.start - _leftSlop; + else fudgeStart = 0; + fudgeEnd = a.end + _rightSlop; + } + else { + if ((a.start - _rightSlop) > 0) fudgeStart = a.start - _rightSlop; + else fudgeStart = 0; + fudgeEnd = a.end + _leftSlop; + } + } + // If not, add the windows irrespective of strand + else { + if ((a.start - _leftSlop) > 0) fudgeStart = a.start - _leftSlop; + else fudgeStart = 0; + fudgeEnd = a.end + _rightSlop; + } } diff --git a/src/windowBed/windowBed.h b/src/windowBed/windowBed.h index 229b74adeb4f5f70b349e001513c80dc398a38f9..d2af1849698c99bf1f109a5cf33a97a69982e2f6 100644 --- a/src/windowBed/windowBed.h +++ b/src/windowBed/windowBed.h @@ -31,38 +31,38 @@ class BedWindow { public: - // constructor - BedWindow(string bedAFile, string bedBFile, int leftSlop, int rightSlop, - bool anyHit, bool noHit, bool writeCount, bool strandWindows, - bool matchOnStrand, bool bamInput, bool bamOutput, bool isUncompressedBam); + // constructor + BedWindow(string bedAFile, string bedBFile, int leftSlop, int rightSlop, + bool anyHit, bool noHit, bool writeCount, bool strandWindows, + bool matchOnStrand, bool bamInput, bool bamOutput, bool isUncompressedBam); + + // destructor + ~BedWindow(void); - // destructor - ~BedWindow(void); - private: - string _bedAFile; - string _bedBFile; - bool _anyHit; - bool _writeCount; - int _leftSlop; - int _rightSlop; - bool _noHit; - bool _strandWindows; - bool _matchOnStrand; - bool _bamInput; - bool _bamOutput; - bool _isUncompressedBam; + string _bedAFile; + string _bedBFile; + bool _anyHit; + bool _writeCount; + int _leftSlop; + int _rightSlop; + bool _noHit; + bool _strandWindows; + bool _matchOnStrand; + bool _bamInput; + bool _bamOutput; + bool _isUncompressedBam; + + // instance of a bed file class. + BedFile *_bedA, *_bedB; - // instance of a bed file class. - BedFile *_bedA, *_bedB; - - // methods - void WindowIntersectBed(); - void WindowIntersectBam(string bamFile); - void FindWindowOverlaps(const BED &a, vector<BED> &hits); - bool FindOneOrMoreWindowOverlaps(const BED &a); - void AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd); + // methods + void WindowIntersectBed(); + void WindowIntersectBam(string bamFile); + void FindWindowOverlaps(const BED &a, vector<BED> &hits); + bool FindOneOrMoreWindowOverlaps(const BED &a); + void AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd); }; #endif /* WINDOWBED_H */ diff --git a/src/windowBed/windowMain.cpp b/src/windowBed/windowMain.cpp index e3a20411e0d29e8fb760733ddf03547e8db98210..b15854f2e1d099d77e853378734a6cf8a31f9f47 100644 --- a/src/windowBed/windowMain.cpp +++ b/src/windowBed/windowMain.cpp @@ -26,226 +26,226 @@ void ShowHelp(void); int main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - - // input arguments - int leftSlop = 1000; - int rightSlop = 1000; - - bool haveBedA = false; - bool haveBedB = false; - bool noHit = false; - bool anyHit = false; - bool writeCount = false; - bool haveSlop = false; - bool haveLeft = false; - bool haveRight = false; - bool strandWindows = false; - bool matchOnStrand = false; - bool inputIsBam = false; - bool outputIsBam = true; + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + int leftSlop = 1000; + int rightSlop = 1000; + + bool haveBedA = false; + bool haveBedB = false; + bool noHit = false; + bool anyHit = false; + bool writeCount = false; + bool haveSlop = false; + bool haveLeft = false; + bool haveRight = false; + bool strandWindows = false; + bool matchOnStrand = false; + bool inputIsBam = false; + bool outputIsBam = true; bool uncompressedBam = false; - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - inputIsBam = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { - outputIsBam = false; - } - else if(PARAMETER_CHECK("-u", 2, parameterLength)) { - anyHit = true; - } - else if(PARAMETER_CHECK("-c", 2, parameterLength)) { - writeCount = true; - } - else if (PARAMETER_CHECK("-v", 2, parameterLength)) { - noHit = true; - } - else if (PARAMETER_CHECK("-sw", 3, parameterLength)) { - strandWindows = true; - } - else if (PARAMETER_CHECK("-sm", 3, parameterLength)) { - matchOnStrand = true; - } - else if (PARAMETER_CHECK("-w", 2, parameterLength)) { - if ((i+1) < argc) { - haveSlop = true; - leftSlop = atoi(argv[i + 1]); - rightSlop = leftSlop; - i++; - } - } - else if (PARAMETER_CHECK("-l", 2, parameterLength)) { - if ((i+1) < argc) { - haveLeft = true; - leftSlop = atoi(argv[i + 1]); - i++; - } - } - else if (PARAMETER_CHECK("-r", 2, parameterLength)) { - if ((i+1) < argc) { - haveRight = true; - rightSlop = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + inputIsBam = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + outputIsBam = false; + } + else if(PARAMETER_CHECK("-u", 2, parameterLength)) { + anyHit = true; + } + else if(PARAMETER_CHECK("-c", 2, parameterLength)) { + writeCount = true; + } + else if (PARAMETER_CHECK("-v", 2, parameterLength)) { + noHit = true; + } + else if (PARAMETER_CHECK("-sw", 3, parameterLength)) { + strandWindows = true; + } + else if (PARAMETER_CHECK("-sm", 3, parameterLength)) { + matchOnStrand = true; + } + else if (PARAMETER_CHECK("-w", 2, parameterLength)) { + if ((i+1) < argc) { + haveSlop = true; + leftSlop = atoi(argv[i + 1]); + rightSlop = leftSlop; + i++; + } + } + else if (PARAMETER_CHECK("-l", 2, parameterLength)) { + if ((i+1) < argc) { + haveLeft = true; + leftSlop = atoi(argv[i + 1]); + i++; + } + } + else if (PARAMETER_CHECK("-r", 2, parameterLength)) { + if ((i+1) < argc) { + haveRight = true; + rightSlop = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { uncompressedBam = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && noHit) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (anyHit && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (haveLeft && (leftSlop < 0)) { - cerr << endl << "*****" << endl << "*****ERROR: Upstream window (-l) must be positive." << endl << "*****" << endl; - showHelp = true; - } - - if (haveRight && (rightSlop < 0)) { - cerr << endl << "*****" << endl << "*****ERROR: Downstream window (-r) must be positive." << endl << "*****" << endl; - showHelp = true; - } - - if (haveSlop && (haveLeft || haveRight)) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot choose -w with -l or -r. Either specify -l and -r or specify solely -w" << endl << "*****" << endl; - showHelp = true; - } - - if ((haveLeft && !haveRight) || (haveRight && !haveLeft)) { - cerr << endl << "*****" << endl << "*****ERROR: Please specify both -l and -r." << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - BedWindow *bi = new BedWindow(bedAFile, bedBFile, leftSlop, rightSlop, anyHit, - noHit, writeCount, strandWindows, matchOnStrand, - inputIsBam, outputIsBam, uncompressedBam); - delete bi; - return 0; - } - else { - ShowHelp(); - } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && noHit) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (haveLeft && (leftSlop < 0)) { + cerr << endl << "*****" << endl << "*****ERROR: Upstream window (-l) must be positive." << endl << "*****" << endl; + showHelp = true; + } + + if (haveRight && (rightSlop < 0)) { + cerr << endl << "*****" << endl << "*****ERROR: Downstream window (-r) must be positive." << endl << "*****" << endl; + showHelp = true; + } + + if (haveSlop && (haveLeft || haveRight)) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot choose -w with -l or -r. Either specify -l and -r or specify solely -w" << endl << "*****" << endl; + showHelp = true; + } + + if ((haveLeft && !haveRight) || (haveRight && !haveLeft)) { + cerr << endl << "*****" << endl << "*****ERROR: Please specify both -l and -r." << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedWindow *bi = new BedWindow(bedAFile, bedBFile, leftSlop, rightSlop, anyHit, + noHit, writeCount, strandWindows, matchOnStrand, + inputIsBam, outputIsBam, uncompressedBam); + delete bi; + return 0; + } + else { + ShowHelp(); + } } void ShowHelp(void) { - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Examines a \"window\" around each feature in A and" << endl; - cerr << "\t reports all features in B that overlap the window. For each" << endl; - cerr << "\t overlap the entire entry in A and B are reported." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; - - cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; - - cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; - cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; - - cerr << "\t-w\t" << "Base pairs added upstream and downstream of each entry" << endl; - cerr << "\t\tin A when searching for overlaps in B." << endl; - cerr << "\t\t- Creates symterical \"windows\" around A." << endl; - cerr << "\t\t- Default is 1000 bp." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-l\t" << "Base pairs added upstream (left of) of each entry" << endl; - cerr << "\t\tin A when searching for overlaps in B." << endl; - cerr << "\t\t- Allows one to define assymterical \"windows\"." << endl; - cerr << "\t\t- Default is 1000 bp." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-r\t" << "Base pairs added downstream (right of) of each entry" << endl; - cerr << "\t\tin A when searching for overlaps in B." << endl; - cerr << "\t\t- Allows one to define assymterical \"windows\"." << endl; - cerr << "\t\t- Default is 1000 bp." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-sw\t" << "Define -l and -r based on strand. For example if used, -l 500" << endl; - cerr << "\t\tfor a negative-stranded feature will add 500 bp downstream." << endl; - cerr << "\t\t- Default = disabled." << endl << endl; - - cerr << "\t-sm\t" << "Only report hits in B that overlap A on the same strand." << endl; - cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; - - cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; - cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl << endl; - - cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; - cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; - cerr << "\t\t- Overlaps restricted by -f." << endl << endl; - - cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; - cerr << "\t\t- Similar to \"grep -v.\"" << endl << endl; - - // end the program here - exit(1); + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Examines a \"window\" around each feature in A and" << endl; + cerr << "\t reports all features in B that overlap the window. For each" << endl; + cerr << "\t overlap the entire entry in A and B are reported." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; + + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; + + cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-w\t" << "Base pairs added upstream and downstream of each entry" << endl; + cerr << "\t\tin A when searching for overlaps in B." << endl; + cerr << "\t\t- Creates symterical \"windows\" around A." << endl; + cerr << "\t\t- Default is 1000 bp." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-l\t" << "Base pairs added upstream (left of) of each entry" << endl; + cerr << "\t\tin A when searching for overlaps in B." << endl; + cerr << "\t\t- Allows one to define assymterical \"windows\"." << endl; + cerr << "\t\t- Default is 1000 bp." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-r\t" << "Base pairs added downstream (right of) of each entry" << endl; + cerr << "\t\tin A when searching for overlaps in B." << endl; + cerr << "\t\t- Allows one to define assymterical \"windows\"." << endl; + cerr << "\t\t- Default is 1000 bp." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-sw\t" << "Define -l and -r based on strand. For example if used, -l 500" << endl; + cerr << "\t\tfor a negative-stranded feature will add 500 bp downstream." << endl; + cerr << "\t\t- Default = disabled." << endl << endl; + + cerr << "\t-sm\t" << "Only report hits in B that overlap A on the same strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; + cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl << endl; + + cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; + cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; + cerr << "\t\t- Overlaps restricted by -f." << endl << endl; + + cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; + cerr << "\t\t- Similar to \"grep -v.\"" << endl << endl; + + // end the program here + exit(1); }