#include #include #include #include #include #include #include #include #include #include #include "PGTwoIII_PerProtein.h" #include "PGTwoIII_PD.h" using namespace std; void mining(ProjectedDatabase & PositiveDataset, const vector & SeqPositive, ProjectedDatabase & NegativeDataset, const vector & SeqNegative, const string & AASubType, const double & Min_Support_Diff, const int & Min_Pat_Length, const int & Min_Evaluate, ofstream & outfSeq); int main(int argc, char *argv[]) { if (argc != 8) { cout << "Welcome to PGOneIII (mining type III patterns by pattern growth from one dataset.\n\n" << "7 arguments are required:\n" << "1. The first argument should be the minimal number \n" << " of frequent items in one pattern to be reported; (3)\n" << "2. The second argument should be the minimal support difference; (0.8)\n" << "3. The third argument should be the length of window to search; (10)\n" << "4. The fourth argument should be the minimal number of frequent\n" << " items in the patterns before evaluating support difference; (3)\n" << "5. The forth argument should be the pathway and filename\n" << " of the positive dataset in fasta format;\n" << "6. The fifth argument should be the pathway and filename\n" << " of the negative dataset in fasta format;\n" << "7. The sixth argument should be the pathway and filename\n" << " of output file." << endl; return 0; } int Min_Pat_Length = atoi(argv[1]); double Min_Support_Diff = atof(argv[2]); int Window = atoi(argv[3]); int Min_Evaluate = atoi(argv[4]); ifstream inf_Seq_Positive(argv[5]); if (!inf_Seq_Positive) cout << "Sorry, cannot find the file: " << argv[5] << endl; ifstream inf_Seq_Negative(argv[6]); if (!inf_Seq_Negative) cout << "Sorry, cannot find the file: " << argv[6] << endl; char OutputFilename[50]; strcpy(OutputFilename, argv[7]); ofstream outfSeq(OutputFilename); if (!outfSeq) cout << "Sorry, cannot write to the file: " << argv[7] << endl; const string AASubType = "GASTCVLIMPFYWDENQHKR"; // ######################### input sequences database ######################## vector < string > SeqPositive; vector < string > SeqNegative; string TempName, TempStr; while ( inf_Seq_Positive >> TempName >> TempStr ) { if (!TempStr.empty()) SeqPositive.push_back( TempStr + "#"); }; while ( inf_Seq_Negative >> TempName >> TempStr ) { if (!TempStr.empty()) SeqNegative.push_back( TempStr + "#"); }; cout << "Size of the positive database: " << SeqPositive.size() << endl; cout << "Size of the negative database: " << SeqNegative.size() << endl; // ##################### end of input sequences database ##################### // ##################### prefixspan ########################################## ProjectedDatabase EmptyProData; for ( int AAIndex = 0; AAIndex < 20; AAIndex++ ){ ProjectedDatabase ProDataPositive(EmptyProData); ProjectedDatabase ProDataNegative(EmptyProData); //cout << "Initiate projected database for " << AASubType[AAIndex] << endl; ProDataPositive.InitiateProData(AASubType[AAIndex], SeqPositive, Window); ProDataNegative.InitiateProData(AASubType[AAIndex], SeqNegative, Window); cout << "Mining projected database for Prefix " << AASubType[AAIndex] << endl; mining(ProDataPositive, SeqPositive, ProDataNegative, SeqNegative, AASubType, Min_Support_Diff, Min_Pat_Length, Min_Evaluate, outfSeq); } cout << "End of this run!" << endl; return 0; } void mining(ProjectedDatabase & PositiveDataset, const vector & SeqPositive, ProjectedDatabase & NegativeDataset, const vector & SeqNegative, const string & AASubType, const double & Min_Support_Diff, const int & Min_Pat_Length, const int & Min_Evaluate, ofstream & outfSeq) { double SupportPositive = (double)PositiveDataset.GetSupport() / SeqPositive.size(); double SupportNegative = (double)NegativeDataset.GetSupport() / SeqNegative.size(); if ( PositiveDataset.GetPrefixSize() >= Min_Evaluate && SupportPositive - SupportNegative < Min_Support_Diff ) return; if (PositiveDataset.GetPrefixSize() >= Min_Pat_Length) { vector OutPrefix = PositiveDataset.GetPrefix(); for (int PrefixIndex = 0; PrefixIndex < (int)(OutPrefix.size()); PrefixIndex++) { outfSeq << OutPrefix[PrefixIndex]; if (PrefixIndex!= (int)(OutPrefix.size()-1)) outfSeq << "->"; } outfSeq << "\t" << SupportPositive << " - " << SupportNegative << " = " << SupportPositive - SupportNegative << endl; } for (int AAIndex = 0; AAIndex < 20; AAIndex++ ) { ProjectedDatabase TempProDataPositive(PositiveDataset); ProjectedDatabase TempProDataNegative(NegativeDataset); TempProDataPositive.UpdateProData(AASubType[AAIndex], SeqPositive); TempProDataNegative.UpdateProData(AASubType[AAIndex], SeqNegative); mining(TempProDataPositive, SeqPositive, TempProDataNegative, SeqNegative, AASubType, Min_Support_Diff, Min_Pat_Length, Min_Evaluate, outfSeq); } }