#include #include #include #include #include //#include #include #include #include #include #include "PGTwoI_PerProtein.h" #include "PGTwoI_PD.h" using namespace std; const string AASubType = "GASTCVLIMPFYWDENQHKR"; struct MotifSupport { vector Motif; int Support; }; void mining(ProjectedDatabase & ProData_Positive, const vector & SeqPositive, ProjectedDatabase & ProData_Negative, const vector & SeqNegative, const double & Min_Sup_Diff, const int & Min_Pat_Length, const int & Max_WildCard_Length, const int & Min_Evaluate, vector & OutPut); int main(int argc, char *argv[]) { if (argc != 8) { cout << "Welcome to PGTwoI (mining type I patterns \n" << "by Pattern Growth from two datasets).\n\n" << "7 arguments are required here:\n" << "1. The first argument should be the minimal number of \n" << " non-wildcard residues in the pattern to be reported; (3)\n" << "2. The second argument should be the minimal support difference; (0.8)\n" << "3. The third argument should be the maximal length of\n" << " consective wildcards in the patterns; (3)\n" << "4. The fourth argument should be the minimal length of non-wildcard\n" << " residues in the patterns before evaluating support difference; (3)\n" << "5. The fourth argument should be the pathway and filename\n" << " of positive dataset in fasta format;\n" << "6. The fifth argument should be the pathway and filename\n" << " of negative dataset in fasta format;\n" << "7. The fifth argument should be the pathway and filename\n" << " of the output file." << endl; return 0; } int Min_Pat_Length = atoi(argv[1]); // number of non-wildcard items, AxTxxD is 3 double Min_Sup_Diff = atof(argv[2]); // minimum support difference between positive and negative dataset int Max_WildCard_Length = atoi(argv[3]); // maximal allowed wildcard length, AxTxxxTxxD is 3 int Min_Evaluate = atoi(argv[4]); ifstream inf_Seq_Positive(argv[5]); // input file name if (!inf_Seq_Positive) cout << "Sorry, cannot find the file: " << argv[5] << endl; ifstream inf_Seq_Negative(argv[6]); // input file name if (!inf_Seq_Negative) cout << "Sorry, cannot find the file: " << argv[6] << endl; char OutputFilename[50]; // output file name strcpy(OutputFilename, argv[7]); ofstream outfSeq(OutputFilename); if (!outfSeq) cout << "Sorry, cannot write to the file: " << argv[7] << endl; // ######################### input sequences database ############## vector < string > SeqPositive; vector < string > SeqNegative; string TempName, TempStr; while ( inf_Seq_Positive >> TempName >> TempStr ) { if (!TempStr.empty()) SeqPositive.push_back( TempStr + "#" ); }; while ( inf_Seq_Negative >> TempName >> TempStr ) { if (!TempStr.empty()) SeqNegative.push_back( TempStr + "#" ); }; cout << "Size of the positive database: " << SeqPositive.size() << endl; cout << "Size of the negative database: " << SeqNegative.size() << endl; //system("PAUSE"); // ##################### end of input sequences database ########### // ##################### prefixspan ################################ ProjectedDatabase EmptyProData; vector OutPut; for ( int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase ProData_Positive(EmptyProData,false); ProjectedDatabase ProData_Negative(EmptyProData,false); ProData_Positive.InitiateProData(AASubType[AAIndex], SeqPositive); ProData_Negative.InitiateProData(AASubType[AAIndex], SeqNegative); mining(ProData_Positive, SeqPositive, ProData_Negative, SeqNegative, Min_Sup_Diff, Min_Pat_Length, Max_WildCard_Length, Min_Evaluate, OutPut); } for (int OutPutIndex = 0; OutPutIndex < (int)OutPut.size(); OutPutIndex++) { //outfSeq for (int PrefixIndex = 0; PrefixIndex < (int)OutPut[OutPutIndex].Motif.size(); PrefixIndex++) { outfSeq << OutPut[OutPutIndex].Motif[PrefixIndex]; } outfSeq << "\t" << OutPut[OutPutIndex].Support << endl; } //cout << "End of this run!" << endl; return 0; } // parameters are projected database, original input database, // support ratio, minimal non-wildcard items, // maximal length of continuous wildcard, where to output results void mining(ProjectedDatabase & ProData_Positive, const vector & SeqPositive, ProjectedDatabase & ProData_Negative, const vector & SeqNegative, const double & Min_Sup_Diff, const int & Min_Pat_Length, const int & Max_WildCard_Length, const int & Min_Evaluate, vector & OutPut) { if ( ((int)ProData_Positive.GetPrefixSize() >= Min_Evaluate) && ((double)ProData_Positive.GetSupport()/SeqPositive.size() - (double)ProData_Negative.GetSupport()/SeqNegative.size() < Min_Sup_Diff) ) return; // end current projected database if support is low // if support is more than MinSupRatio and non-wildcard items is // more than Min_Pat_Length, output pattern if ((ProData_Positive.GetPrefixSize()-ProData_Positive.GetTotalWildCardLength()) >= Min_Pat_Length) { MotifSupport TempMotifSupport; TempMotifSupport.Motif = ProData_Positive.GetPrefix(); if ( TempMotifSupport.Motif[TempMotifSupport.Motif.size()-1]!= 'x') { TempMotifSupport.Support = ProData_Positive.GetSupport(); OutPut.push_back(TempMotifSupport); } } if (ProData_Positive.GetCurrentWildCardLength() < Max_WildCard_Length) { ProjectedDatabase TempProData_Positive(ProData_Positive,false); TempProData_Positive.MoveAll(); ProjectedDatabase TempProData_Negative(ProData_Negative,false); TempProData_Negative.MoveAll(); mining(TempProData_Positive, SeqPositive, TempProData_Negative, SeqNegative, Min_Sup_Diff, Min_Pat_Length, Max_WildCard_Length, Min_Evaluate, OutPut); } for (int AAIndex = 19; AAIndex >= 0; AAIndex-- ) { ProjectedDatabase TempProData_Positive(ProData_Positive,true); TempProData_Positive.UpdateProData(ProData_Positive, AASubType[AAIndex], SeqPositive); ProjectedDatabase TempProData_Negative(ProData_Negative,true); TempProData_Negative.UpdateProData(ProData_Negative, AASubType[AAIndex], SeqNegative); mining(TempProData_Positive, SeqPositive, TempProData_Negative, SeqNegative, Min_Sup_Diff, Min_Pat_Length, Max_WildCard_Length, Min_Evaluate, OutPut); } }