#include #include #include #include #include #include #include #include #include #include "PGOneI_PerProtein.h" #include "PGOneI_PD.h" using namespace std; const string AASubType = "GASTCVLIMPFYWDENQHKR"; struct MotifSupport { vector Motif; int Support; }; void mining(ProjectedDatabase & TempProData, const vector & SequencesDatabase, const double & MinSupRatio, const int & Min_Pat_Length, const int & Max_WildCard_Length, vector & OutPut); int main(int argc, char *argv[]) { if (argc != 6) { cout << "Welcome to PGOneI (mining type I patterns by Pattern Growth from one dataset).\n\n" << "5 arguments are required here:\n" << "1. The first argument should be the minimal number of \n" << " non-wildcard residues in the pattern to be reported; (3)\n" << "2. The second argument should be the minimal proportion\n" << " of proteins haveing the pattern; (0.8)\n" << "3. The third argument should be the maximal length of\n" << " wildcard in the patterns; (3)\n" << "4. The fourth argument should be the pathway and filename\n" << " of protein sequences in fasta format;\n" << "5. The fifth argument should be the pathway and filename\n" << " of the output file." << endl; return 0; } int Min_Pat_Length = atoi(argv[1]); // number of non-wildcard items, AxTxxD is 3 double Min_Sup_ratio = atof(argv[2]); // ratio of proteins support the pattern in the input int Max_WildCard_Length = atoi(argv[3]); // maximal allowed wildcard length, AxTxxxTxxD is 3 ifstream inf_Seq(argv[4]); // input file name if (!inf_Seq) cout << "Sorry, cannot find the file: " << argv[4] << endl; char OutputFilename[50]; // output file name strcpy(OutputFilename, argv[5]); ofstream outfSeq(OutputFilename); if (!outfSeq) cout << "Sorry, cannot write to the file: " << argv[5] << endl; // ######################### input sequences database ############## vector < string > SequencesDatabase; string TempName, TempStr; while ( inf_Seq >> TempName >> TempStr ) { if (!TempStr.empty()) SequencesDatabase.push_back( TempStr + "#" ); }; cout << "Size of the database: " << SequencesDatabase.size() << endl; // ##################### end of input sequences database ########### // ##################### prefixspan ################################ ProjectedDatabase EmptyProData; vector OutPut; for ( int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase ProData(EmptyProData,false); ProData.InitiateProData(AASubType[AAIndex], SequencesDatabase); mining(ProData, SequencesDatabase, Min_Sup_ratio, Min_Pat_Length, Max_WildCard_Length, OutPut); } for (int OutPutIndex = 0; OutPutIndex < (int)OutPut.size(); OutPutIndex++) { for (int PrefixIndex = 0; PrefixIndex < (int)OutPut[OutPutIndex].Motif.size(); PrefixIndex++) { outfSeq << OutPut[OutPutIndex].Motif[PrefixIndex]; } outfSeq << "\t" << OutPut[OutPutIndex].Support << endl; } cout << "End of this run!" << endl; return 0; } // parameters are projected database, original input database, // support ratio, minimal non-wildcard items, // maximal length of continuous wildcard, where to output results void mining(ProjectedDatabase & TempProData, const vector & SequencesDatabase, const double & MinSupRatio, const int & Min_Pat_Length, const int & Max_WildCard_Length, vector & OutPut) { if ( TempProData.GetSupport() < SequencesDatabase.size() * MinSupRatio ) return; // end current projected database if support is low // if support is more than MinSupRatio and non-wildcard items is // more than Min_Pat_Length, output pattern if ((TempProData.GetPrefixSize()-TempProData.GetTotalWildCardLength()) >= Min_Pat_Length) { MotifSupport TempMotifSupport; TempMotifSupport.Motif = TempProData.GetPrefix(); if ( TempMotifSupport.Motif[TempMotifSupport.Motif.size()-1]!= 'x') { TempMotifSupport.Support = TempProData.GetSupport(); OutPut.push_back(TempMotifSupport); } } if (TempProData.GetCurrentWildCardLength() < Max_WildCard_Length) { ProjectedDatabase ProData(TempProData,false); ProData.MoveAll(); mining(ProData, SequencesDatabase, MinSupRatio, Min_Pat_Length, Max_WildCard_Length, OutPut); } for (int AAIndex = 19; AAIndex >= 0; AAIndex-- ) { ProjectedDatabase ProData(TempProData,true); ProData.UpdateProData(TempProData, AASubType[AAIndex], SequencesDatabase); mining(ProData, SequencesDatabase, MinSupRatio, Min_Pat_Length, Max_WildCard_Length, OutPut); } }