#include #include #include #include #include #include #include #include #include #include #include "PGOneIII_PerProtein.h" #include "PGOneIII_PD.h" using namespace std; void mining(ProjectedDatabase & TempProData, const vector & SequencesDatabase, const string & AASubType, const double & MinSupRatio, const int & Min_Pat_Length, ofstream & outfSeq); int main(int argc, char *argv[]) { if (argc != 6) { cout << "Welcome to PGOneIII (mining type III patterns by pattern growth from one dataset.\n\n" << "5 arguments are required:\n" << "1. The first argument should be the minimal number \n" << " of frequent items in one pattern should be reported; (3)\n" << "2. The second argument should be the minimal proportion\n" << " of proteins have the pattern; (0.8)\n" << "3. The third argument should be the length of window to search(20)\n" << "4. The forth argument should be the pathway and filename\n" << " of protein sequences in fasta format;\n" << "5. The fifth argument should be the pathway and filename\n" << " of output file." << endl; return 0; } int Min_Pat_Length = atoi(argv[1]); double Min_Sup_ratio = atof(argv[2]); int Window = atoi(argv[3]); ifstream inf_Seq(argv[4]); if (!inf_Seq) cout << "Sorry, cannot find the file: " << argv[4] << endl; char OutputFilename[50]; strcpy(OutputFilename, argv[5]); ofstream outfSeq(OutputFilename); if (!outfSeq) cout << "Sorry, cannot write to the file: " << argv[5] << endl; const string AASubType = "GASTCVLIMPFYWDENQHKR"; // ######################### input sequences database ######################## vector < string > SequencesDatabase; string TempName, TempStr; while ( inf_Seq >> TempName >> TempStr ) { if (!TempStr.empty()) SequencesDatabase.push_back( TempStr + "#"); }; cout << "Size of the database: " << SequencesDatabase.size() << endl; // ##################### end of input sequences database ##################### // ##################### prefixspan ########################################## ProjectedDatabase EmptyProData; for ( int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase ProData(EmptyProData); ProData.InitiateProData(AASubType[AAIndex], SequencesDatabase, Window); cout << "Mining projected database for Prefix " << AASubType[AAIndex] << endl; mining(ProData, SequencesDatabase, AASubType, Min_Sup_ratio, Min_Pat_Length, outfSeq); } cout << "End of this run!" << endl; return 0; } void mining( ProjectedDatabase & TempProData, const vector & SequencesDatabase, const string & AASubType, const double & MinSupRatio, const int & Min_Pat_Length, ofstream & outfSeq) { if ( TempProData.GetSupport() < SequencesDatabase.size() * MinSupRatio ) return; if (TempProData.GetPrefixSize() >= Min_Pat_Length) { vector OutPrefix = TempProData.GetPrefix(); for (int PrefixIndex = 0; PrefixIndex < (int)(OutPrefix.size()); PrefixIndex++) { outfSeq << OutPrefix[PrefixIndex]; if (PrefixIndex!= (int)(OutPrefix.size()-1)) outfSeq << "->"; } outfSeq << "\t" << TempProData.GetSupport() << endl; } for (int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase ProData(TempProData); ProData.UpdateProData(AASubType[AAIndex], SequencesDatabase); mining(ProData, SequencesDatabase, AASubType, MinSupRatio, Min_Pat_Length, outfSeq); } }