#include #include #include #include #include #include #include #include #include #include #include "PGTwoII_PerProtein.h" #include "PGTwoII_PD.h" using namespace std; void mining( ProjectedDatabase & ProDataPositive, const vector & SeqPositive, ProjectedDatabase & ProDataNegative, const vector & SeqNegative, const string & AASubType, const int & Min_Pat_Length, const double & Min_Sup_Diff, const int & Max_Consective_WC, const int & Max_Ambigu, const int & Min_Evaluate, ofstream & outfSeq); int main(int argc, char *argv[]) { if (argc != 9) { cout << "Welcome to PGOneII (mining type II patterns by pattern growth from one dataset).\n" << "6 arguments are required:\n" << "1. The first argument should be the minimal number \n" << " of frequent items in one pattern should be reported; (3)\n" << "2. The second argument should be the minimal support difference. (0.8)\n" << "3. The third argument should be the maximal number of consective" << " wildcards (3)\n" << "4. The fourth argument should be the maximal ambiguousness in the \n" << " number of wildcards; (1) For example, Ax(2,4)T is 2 (4-2=2).\n" << "5. The fifth argument should be the minimal number of non-wildcards \n" << " in the patterns before evaluating support difference. (3)\n" << "6. The sixth argument should be the pathway and filename\n" << " of positive dataset in fasta format;\n" << "7. The seventh argument should be the pathway and filename\n" << " of negative dataset in fasta format;\n" << "8. The eighth argument should be the pathway and filename\n" << " of output file." << endl; return 0; } int Min_Pat_Length = atoi(argv[1]); double Min_Sup_Diff = atof(argv[2]); int Max_Consective_WC = atoi(argv[3]); int Max_Ambigu = atoi(argv[4]); int Min_Evaluate = atoi(argv[5]); ifstream inf_Seq_Positive(argv[6]); if (!inf_Seq_Positive) cout << "Sorry, cannot find the file: " << argv[6] << endl; ifstream inf_Seq_Negative(argv[7]); if (!inf_Seq_Negative) cout << "Sorry, cannot find the file: " << argv[7] << endl; char OutputFilename[50]; strcpy(OutputFilename, argv[8]); ofstream outfSeq(OutputFilename); if (!outfSeq) cout << "Sorry, cannot write to the file: " << argv[8] << endl; const string AASubType = "GASTCVLIMPFYWDENQHKR"; // ######################### input sequences database ######################## vector < string > SeqPositive; vector < string > SeqNegative; string TempName, TempStr; while ( inf_Seq_Positive >> TempName >> TempStr ) { if (!TempStr.empty()) SeqPositive.push_back( TempStr + "#"); }; while ( inf_Seq_Negative >> TempName >> TempStr ) { if (!TempStr.empty()) SeqNegative.push_back( TempStr + "#"); }; cout << "Size of the positive database: " << SeqPositive.size() << endl; cout << "Size of the negative database: " << SeqNegative.size() << endl; // ##################### end of input sequences database ##################### // ##################### prefixspan ########################################## ProjectedDatabase EmptyProData; for ( int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase ProDataPositive(EmptyProData); ProjectedDatabase ProDataNegative(EmptyProData); ProDataPositive.InitiateProData(AASubType[AAIndex], SeqPositive); ProDataNegative.InitiateProData(AASubType[AAIndex], SeqNegative); cout << "Mining projected database for Prefix " << AASubType[AAIndex] << endl; mining(ProDataPositive, SeqPositive, ProDataNegative, SeqNegative, AASubType, Min_Pat_Length, Min_Sup_Diff, Max_Consective_WC, Max_Ambigu, Min_Evaluate, outfSeq); } cout << "End of this run!" << endl; return 0; } void mining( ProjectedDatabase & ProDataPositive, const vector & SeqPositive, ProjectedDatabase & ProDataNegative, const vector & SeqNegative, const string & AASubType, const int & Min_Pat_Length, const double & Min_Sup_Diff, const int & Max_Consective_WC, const int & Max_Ambigu, const int & Min_Evaluate, ofstream & outfSeq) { double SupportPositive = (double)ProDataPositive.GetSupport()/SeqPositive.size(); double SupportNegative = (double)ProDataNegative.GetSupport()/SeqPositive.size(); if ( (int)ProDataPositive.GetPrefixSize()>= Min_Evaluate && (SupportPositive - SupportNegative < Min_Sup_Diff) ) return; if (ProDataPositive.GetPrefixSize() >= Min_Pat_Length) { vector OutPrefix = ProDataPositive.GetPrefix(); vector OutWildcardNumber = ProDataPositive.GetWildcardNumber(); for (int PrefixIndex = 0; PrefixIndex < (int)(OutPrefix.size()); PrefixIndex++) { outfSeq << OutPrefix[PrefixIndex]; if (PrefixIndex!= (int)(OutPrefix.size()-1)) { if (OutWildcardNumber[PrefixIndex*2] < OutWildcardNumber[PrefixIndex*2+1]) outfSeq << "x(" << OutWildcardNumber[PrefixIndex*2] << "," << OutWildcardNumber[PrefixIndex*2+1] << ")"; else if (OutWildcardNumber[PrefixIndex*2] == OutWildcardNumber[PrefixIndex*2+1]) { for (int WilcardOutIndex = 0; WilcardOutIndex < OutWildcardNumber[PrefixIndex*2]; WilcardOutIndex++) outfSeq << "x"; } else cout << "The second wildcard number is bigger than the first one !!" << endl; } } outfSeq << "\t\t" << SupportPositive << " - " << SupportNegative << " = " << SupportPositive - SupportNegative << endl; } for (int AAIndex = 0; AAIndex < 20; AAIndex++ ) { for (int AmbiguIndex = 0; AmbiguIndex <= Max_Ambigu; AmbiguIndex++) for (int WildcardNumberLowerBoundaryIndex = 0; WildcardNumberLowerBoundaryIndex <= Max_Consective_WC; WildcardNumberLowerBoundaryIndex++) { if (WildcardNumberLowerBoundaryIndex+AmbiguIndex <= Max_Consective_WC) { ProjectedDatabase TempProDataPositive; ProjectedDatabase TempProDataNegative; TempProDataPositive.UpdateProData(ProDataPositive, AASubType[AAIndex], SeqPositive, WildcardNumberLowerBoundaryIndex, WildcardNumberLowerBoundaryIndex+AmbiguIndex); TempProDataNegative.UpdateProData(ProDataNegative, AASubType[AAIndex], SeqNegative, WildcardNumberLowerBoundaryIndex, WildcardNumberLowerBoundaryIndex+AmbiguIndex); mining(TempProDataPositive, SeqPositive, TempProDataNegative, SeqNegative, AASubType, Min_Pat_Length, Min_Sup_Diff, Max_Consective_WC, Max_Ambigu, Min_Evaluate, outfSeq); } } } }