#include #include #include #include #include #include #include #include #include #include #include "PGOneII_PerProtein.h" #include "PGOneII_PD.h" using namespace std; void mining(ProjectedDatabase & TempProData, const vector & SequencesDatabase, const string & AASubType, const int & Min_Pat_Length, const double & MinSupRatio, const int & Max_Consective_WC, const int & Max_Ambigu, ofstream & outfSeq); int main(int argc, char *argv[]) { if (argc != 7) { cout << "Welcome to PGOneII (mining type II patterns by pattern growth from one dataset).\n" << "6 arguments are required:\n" << "1. The first argument should be the minimal number \n" << " of frequent items in one pattern should be reported; (3)\n" << "2. The second argument should be the minimal proportion\n" << " of proteins have the pattern; (0.8)\n" << "3. The third argument should be the maximal number of consective" << "wildcards (3)\n" << "4. The fourth argument should be the maximal ambiguousness in the \n" << " number of wildcards; (1) For example, Ax(2,4)T is 2 (4-2=2).\n" << "5. The fifth argument should be the pathway and filename\n" << " of protein sequences in fasta format;\n" << "6. The sixth argument should be the pathway and filename\n" << " of output file." << endl; return 0; } int Min_Pat_Length = atoi(argv[1]); double Min_Sup_ratio = atof(argv[2]); int Max_Consective_WC = atoi(argv[3]); int Max_Ambigu = atoi(argv[4]); ifstream inf_Seq(argv[5]); if (!inf_Seq) cout << "Sorry, cannot find the file: " << argv[5] << endl; char OutputFilename[50]; strcpy(OutputFilename, argv[6]); ofstream outfSeq(OutputFilename); if (!outfSeq) cout << "Sorry, cannot write to the file: " << argv[6] << endl; const string AASubType = "GASTCVLIMPFYWDENQHKR"; // ######################### input sequences database ######################## vector < string > SequencesDatabase; string TempName, TempStr; while ( inf_Seq >> TempName >> TempStr ) { if (!TempStr.empty()) SequencesDatabase.push_back( TempStr + "#"); }; cout << "Size of the database: " << SequencesDatabase.size() << endl; //system("PAUSE"); // ##################### end of input sequences database ##################### // ##################### prefixspan ########################################## ProjectedDatabase EmptyProData; for ( int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase ProData(EmptyProData); ProData.InitiateProData(AASubType[AAIndex], SequencesDatabase); cout << "Mining projected database for Prefix " << AASubType[AAIndex] << endl; mining(ProData, SequencesDatabase, AASubType, Min_Pat_Length, Min_Sup_ratio, Max_Consective_WC, Max_Ambigu, outfSeq); } cout << "End of this run!" << endl; return 0; } void mining( ProjectedDatabase & TempProData, const vector & SequencesDatabase, const string & AASubType, const int & Min_Pat_Length, const double & MinSupRatio, const int & Max_Consective_WC, const int & Max_Ambigu, ofstream & outfSeq) { if ( TempProData.GetSupport() < SequencesDatabase.size() * MinSupRatio ) return; if (TempProData.GetPrefixSize() >= Min_Pat_Length) { vector OutPrefix = TempProData.GetPrefix(); vector OutWildcardNumber = TempProData.GetWildcardNumber(); for (int PrefixIndex = 0; PrefixIndex < (int)(OutPrefix.size()); PrefixIndex++) { outfSeq << OutPrefix[PrefixIndex]; if (PrefixIndex!= (int)(OutPrefix.size()-1)) { if (OutWildcardNumber[PrefixIndex*2] < OutWildcardNumber[PrefixIndex*2+1]) outfSeq << "x(" << OutWildcardNumber[PrefixIndex*2] << "," << OutWildcardNumber[PrefixIndex*2+1] << ")"; else if (OutWildcardNumber[PrefixIndex*2] == OutWildcardNumber[PrefixIndex*2+1]) { for (int WilcardOutIndex = 0; WilcardOutIndex < OutWildcardNumber[PrefixIndex*2]; WilcardOutIndex++) outfSeq << "x"; } else cout << "The second wildcard number is bigger than the first one !!" << endl; } } outfSeq << "\t" << TempProData.GetSupport() << endl; } for (int AAIndex = 0; AAIndex < 20; AAIndex++ ) { for (int AmbiguIndex = 0; AmbiguIndex <= Max_Ambigu; AmbiguIndex++) for (int WildcardNumberLowerBoundaryIndex = 0; WildcardNumberLowerBoundaryIndex <= Max_Consective_WC; WildcardNumberLowerBoundaryIndex++) { if (WildcardNumberLowerBoundaryIndex+AmbiguIndex <= Max_Consective_WC) { ProjectedDatabase ProData; ProData.UpdateProData(TempProData, AASubType[AAIndex], SequencesDatabase, WildcardNumberLowerBoundaryIndex, WildcardNumberLowerBoundaryIndex+AmbiguIndex); mining(ProData, SequencesDatabase, AASubType, Min_Pat_Length, MinSupRatio, Max_Consective_WC, Max_Ambigu, outfSeq); } } } }