// // C++ program that finds frequent itemsets - file current.cc // April 23, 2004 // Wim Pijls and Walter Kosters // Erasmus University Rotterdam and Leiden University, The Netherlands // pijls@few.eur.nl and kosters@liacs.nl // http://www.liacs.nl/home/kosters/df/ // // makefile: g++ -Wall -O3 -o fim_all current.cc // // Assume that the FP-database and the trie containing all frequent // sets together fit in main memory // // The data is read four times: // 1. to find minimal and maximal item number that occurs // 2. to determine the frequencies of the single item numbers // 3. to find the relevant transactions (containing at least 2 frequent items) // 4. to store the (relevant) database in main memory // // Several (time) printing commands are commented out with // // // This version (current.cc) replaces previous versions dfmemory.cc // and dftime.cc; main difference: memory management is improved // in order to avoid unnecessary allocations, which makes different // memory/time efficient versions obsolete. // Runtimes are comparable with those of its predecessors // Difference with dffast.cc: FP-trees for database; two types distinguished // (int and unsigned short) during runtime using templates // // name inputfile: #define input_filename argv[1] // (absolute) value of minsup: #define macro_minsup atoi(argv[2]) // name outputfile: #define output_filename argv[3] #include #include #include #include #include #include using namespace std; const int MAXDEPTH = 100; // maximal depth of trie (for printing) int minsup; // minimum support //======================================================================== // // DATABASE CONSTRUCTION class initialcounts { public: initialcounts (char *inputdata); static int *itemsorder; static int *items_frequency; static int *ranking; static int min_itemnr, max_itemnr; static int number_transactions, number_freq_items, lines, last; private: void first_pass ( ); void second_pass ( ); void third_pass ( ); void initialsort ( ); char *infilename; int *init_items_frequency; }; int *initialcounts::items_frequency = NULL; int *initialcounts::ranking = NULL; int *initialcounts::itemsorder = NULL; int initialcounts::number_transactions = 0; int initialcounts::lines = 0; int initialcounts::number_freq_items = 0; int initialcounts::min_itemnr = 0; int initialcounts::max_itemnr = 0; int initialcounts::last = 0; // constructor initialcounts::initialcounts (char *inputdata) { infilename = inputdata; first_pass ( ); second_pass ( ); third_pass ( ); initialsort ( ); }//initialcounts::initialcounts // computes minimal and maximal item number that occur in the database; // if these are known in advance, this function can be easily adapted // function reads whole file! void initialcounts::first_pass ( ) { int itemnr; bool first = true; ifstream fin (infilename); if ( ! fin ) cout << "No such filename" << endl; char c; int pos; do { do { fin.get (c); itemnr = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { itemnr = 10*itemnr + (int)(c) - (int)('0'); pos++; fin.get (c); }//while if ( pos ) { if ( first ) max_itemnr = min_itemnr = itemnr; first = false; if ( itemnr < min_itemnr ) min_itemnr = itemnr; else if ( itemnr > max_itemnr ) max_itemnr = itemnr; }//if } while ( c != '\n' && ! fin.eof ( ) ); } while ( ! fin.eof ( ) ); fin.close ( ); }//initialcounts::first_pass // determines frequency for all items, and number of frequent items // function reads whole file! void initialcounts::second_pass ( ) { int k; ifstream fin (infilename); if ( ! fin ) cout << "No such filename" << endl; int itemrange = max_itemnr-min_itemnr+1; init_items_frequency = new int[itemrange]; for ( k = 0; k < itemrange; k++ ) init_items_frequency[k] = 0; char c; int item, pos; do { do { fin.get (c); item = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { item = 10*item + (int)(c) - (int)('0'); pos++; fin.get (c); }//while if ( pos ) init_items_frequency[item-min_itemnr]++; } while ( c != '\n' && ! fin.eof ( ) ); } while ( ! fin.eof ( ) ); fin.close ( ); for ( k = 0; k < itemrange; k++ ) if ( init_items_frequency[k] >= minsup ) number_freq_items++; printf ("Number of frequent items: %d\n", number_freq_items); }//initialcounts::second_pass // determines number of relevant transactions, i.e., those // that contain at least two frequent items // (those that contain one frequent item have already been accounted for // while determining the frequency of the single items) // 20.2.2007: changed to all transactions that contain some frequent item // the variable is never used in the program // function reads whole file! void initialcounts::third_pass ( ) { number_transactions = 0; ifstream fin (infilename); if ( ! fin ) cout << "No such filename" << endl; char c; int item, pos, items_in_trans; bool line; do { line = false; items_in_trans = 0; do { fin.get (c); item = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { item = 10*item + (int)(c) - (int)('0'); pos++; line = true; fin.get (c); }//while if ( pos && init_items_frequency[item-min_itemnr] >= minsup ) items_in_trans++; } while ( c != '\n' && ! fin.eof ( ) ); if ( line ) lines++; if ( items_in_trans >= 1 ) number_transactions++; } while ( ! fin.eof ( ) ); printf ("Number of relevant transactions: %d\n", number_transactions); printf ("Number of lines: %d\n", lines); fin.close ( ); }//initialcounts::third_pass // sort items with respect to support - and renumber void initialcounts::initialsort ( ) { int left_cursor = 0; int right_cursor = max_itemnr-min_itemnr; int itemrange = right_cursor+1; int *items_numbers; int i, j, k; items_numbers = new int[number_freq_items]; for ( k = 0; k < number_freq_items; k++ ) items_numbers[k] = k; while ( left_cursor < right_cursor ) { while ( init_items_frequency[left_cursor] >= minsup && left_cursor= 0; rank-- ) { maxfrequency = -1; for ( j = 0; j < number_freq_items; j++ ) if ( used[j] == 0 && init_items_frequency[j] > maxfrequency ) { bestindex = j; bestitem = items_numbers[j]; maxfrequency = init_items_frequency[j]; }//if itemsorder[rank] = bestitem+min_itemnr; ranking[bestitem] = rank; items_frequency[rank] = maxfrequency; used[bestindex] = 1; }//for }//initialcounts::initialsort //========================================================================= // // FP BUILDING template struct FPtreenode { unsigned short info; T count; unsigned short mark; // unsigned short length; // unsigned short depth; FPtreenode *child; FPtreenode *brother; FPtreenode *nextsame; FPtreenode *father; }; template class FP { public: FPtreenode *FProot; FPtreenode **layer; FP ( ) { FProot = NULL; } void buildFP (char *infilename); private: void updateFPtree (bool *next_transaction); }; // push next_transaction into FP-tree template void FP::updateFPtree (bool *next_transaction) { int art; // unsigned short localdepth = 0; FPtreenode *ptr = FProot; FPtreenode *kid; FPtreenode *prev = NULL; for ( art = initialcounts::number_freq_items-1; art >= 0; art-- ) if ( next_transaction[art] ) { kid = ptr->child; // localdepth++; if ( kid ) { while ( kid && kid->info != art ) { prev = kid; kid = kid->brother; }//while if ( kid ) { kid->count++; ptr = kid; }//if else { prev->brother = new FPtreenode; if ( ptr == FProot ) prev->brother->father = NULL; else prev->brother->father = ptr; ptr = prev->brother; ptr->count = 1; // ptr->depth = localdepth; ptr->mark = ( ptr->info = art ) + 1; ptr->child = ptr->brother = NULL; ptr->nextsame = layer[art]; layer[art] = ptr; }//else }//if else { ptr->child = new FPtreenode; if ( ptr == FProot ) ptr->child->father = NULL; else ptr->child->father = ptr; ptr = ptr->child; ptr->count = 1; // ptr->depth = localdepth; ptr->mark = ( ptr->info = art ) + 1; ptr->child = ptr->brother = NULL; ptr->nextsame = layer[art]; layer[art] = ptr; }//else }//if }//FP::updateFPtree // reads the entire datafile from file inputdata and // puts it into an FP-tree // reads whole file (for the fourth time)! template void FP::buildFP (char *infilename) { int pos, item, items_count, i; unsigned short newitemnr; char c; bool *next_transaction = new bool[initialcounts::number_freq_items]; layer = new FPtreenode*[initialcounts::number_freq_items]; for ( i = 0; i < initialcounts::number_freq_items; i++ ) layer[i] = NULL; FProot = new FPtreenode; FProot->info = 0; FProot->child = FProot->brother = FProot->father = FProot->nextsame = NULL; FProot->count = 0; // FProot->depth = 0; ifstream fin (infilename); if ( ! fin ) cout << "No such filename" << endl; do { for ( int column = 0; column < initialcounts::number_freq_items; column++ ) next_transaction[column] = false; items_count = 0; do { fin.get (c); item = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { item = 10*item + (int)(c) -(int)('0'); pos++; fin.get (c); }//while if ( pos && initialcounts::ranking[item-initialcounts::min_itemnr] >= 0 ) { newitemnr = initialcounts::ranking[item-initialcounts::min_itemnr]; next_transaction[newitemnr] = true; items_count++; }//if } while ( c != '\n' && ! fin.eof ( ) ); if ( items_count >= 1 ) // perhaps 2, but does it really matter? updateFPtree (next_transaction); } while( ! fin.eof ( ) ); fin.close ( ); delete [ ] next_transaction; }//FP::buildFP //========================================================================= // // COUNTING template struct bucket { unsigned short itemvalue; T count; T aux; unsigned short number_followers; bucket *next; }; template class trie { public: trie ( ) { }; trie (initialcounts & initialdata); void build_up (FP & theoriginaltree); void printtrie (char *outputdata); private: int length_count[MAXDEPTH]; bool *frequent; void fpcount3 (unsigned short *nodes, bucket *trienode, unsigned short number_buckets); void makeaux0 (bucket *root, unsigned short number); FILE *outfilename; struct bucket *root; unsigned short triesize; unsigned short k; T thevalue; unsigned short *thearraypointer; int results[MAXDEPTH]; unsigned short *follows; struct bucket **roots; void copying (bucket *p, bucket *q, unsigned short number_q_buckets); void printout (int depth, bucket *trienode, unsigned short number_buckets); void dotheFPtree (FPtreenode *itsroot); }; // constructor template trie::trie (initialcounts & initialdata) { triesize = initialcounts::number_freq_items; root = new bucket[triesize]; frequent = new bool[triesize]; thearraypointer = new unsigned short[triesize+1]; *thearraypointer = 0; thearraypointer++; follows = new unsigned short[triesize]; roots = new bucket*[triesize]; for ( unsigned short itemnr = 0; itemnr < triesize; itemnr++ ) { root[itemnr].count = 666; // to avoid problems with short's; // this particular count-field is never used anymore (I hope) root[itemnr].itemvalue = itemnr; root[itemnr].number_followers = 0; roots[itemnr] = root[itemnr].next = NULL; follows[itemnr] = 0; }//for }//trie::trie // do the FP-counting, already knowing the relevant frequent 2-itemsets // now the deeper levels // number_buckets > 0, nodes contains a sentinel 0 at the start template void trie::fpcount3 (unsigned short *nodes, bucket *trienode, unsigned short number_buckets) { unsigned short x = trienode->itemvalue; do { while ( *nodes != x ) if ( *nodes > x ) { trienode++; if ( ! ( --number_buckets ) ) return; x = trienode->itemvalue; }//if else if ( ! *--nodes ) return; trienode->aux += thevalue; if ( ! *--nodes ) return; if ( trienode->number_followers ) fpcount3 (nodes, trienode->next, trienode->number_followers); trienode++; if ( ! ( --number_buckets ) ) return; x = trienode->itemvalue; } while ( true ); }//trie::fpcount3 // traverse the FP-tree template void trie::dotheFPtree (FPtreenode *itsroot) { itsroot = itsroot->child; while ( itsroot ) { if ( itsroot->mark == k ) { if ( frequent[*thearraypointer = itsroot->info] ) { if ( follows[*thearraypointer] ) { thevalue = itsroot->count; fpcount3 (thearraypointer, roots[*thearraypointer], follows[*thearraypointer]); }//if thearraypointer++; dotheFPtree (itsroot); thearraypointer--; }//if else dotheFPtree (itsroot); }//if itsroot = itsroot->brother; }//while }//trie::dotheFPtree // build trie out of FP-transactions template void trie::build_up (FP & theoriginaltree) { FPtreenode *goingup; FPtreenode *globpointer; T *counttwoitemsets = new T[triesize]; int cnt, i, suppo; k = triesize - 2; while ( triesize > 1 ) { globpointer = theoriginaltree.layer[k]; for ( i = k+1; i < triesize; i++ ) counttwoitemsets[i] = 0; while ( goingup = globpointer ) // note single =! { suppo = globpointer->count; while ( ( goingup = goingup->father ) && goingup->mark > k ) { counttwoitemsets[goingup->info] += suppo; goingup->count = suppo; goingup->mark = k; }//while while ( goingup ) { counttwoitemsets[goingup->info] += suppo; goingup->count += suppo; goingup = goingup->father; }//while globpointer = globpointer->nextsame; }//while cnt = 0; for ( i = triesize-1; i >= k+1; i-- ) // in this order! if ( counttwoitemsets[i] < minsup ) { root[i].aux = 0; frequent[i] = false; }//if else { root[i].aux = counttwoitemsets[i]; frequent[i] = true; makeaux0 (roots[i], follows[i]); cnt++; }//else if ( cnt ) { dotheFPtree (theoriginaltree.FProot); roots[k] = root[k].next = new bucket[cnt]; follows[k] = root[k].number_followers = cnt; copying (roots[k], root+k+1, triesize-k-1); }//if if ( k == 0 ) break; k--; }//while }//trie::build_up // make all necessary aux-fields 0 template void trie::makeaux0 (bucket *root, unsigned short number) { for ( int j = 0; j < number; j++ ) { root->aux = 0; if ( root->number_followers && frequent[root->itemvalue] ) makeaux0 (root->next, root->number_followers); root++; }//for }//trie::makeaux0 // copy trie structure from q to p template void trie::copying (bucket * p, bucket *q, unsigned short number_q_buckets) { short temp; // how many buckets does p need? short i; for ( int source = 0; source < number_q_buckets; source++ ) { if ( q->aux >= minsup ) { p->count = q->aux; p->itemvalue = q->itemvalue; temp = 0; for ( i = q->number_followers-1; i >= 0; i-- ) if ( q->next[i].aux >= minsup ) temp++; p->number_followers = temp; if ( temp > 0 ) { p->next = new bucket[temp]; copying (p->next, q->next, q->number_followers); }//if else p->next = NULL; p++; }//if q++; }//for }//trie::copying // print resulting trie and frequency of each pattern length template void trie::printtrie (char *outputdata) { int k; for ( k = 0; k < MAXDEPTH; k++ ) length_count[k] = 0; outfilename = fopen (outputdata, "w"); fprintf (outfilename, "(%d)\n", initialcounts::lines); // empty set printout (1, root, triesize); int lpl; for ( lpl = MAXDEPTH-1; lpl >= 0 && length_count[lpl] == 0; lpl-- ) ; if ( initialcounts::lines >= minsup ) printf ("1\n"); for ( k = 1; k <= lpl; k++ ) printf ("%d\n", length_count[k]); fclose(outfilename); }//trie::printtrie // do the printing template void trie::printout (int depth, bucket *trienode, unsigned short number_buckets) { for ( int i = 0; i < number_buckets; i++ ) { results[depth] = trienode[i].itemvalue; length_count[depth]++; for ( int j = 1; j <= depth; j++ ) fprintf (outfilename, "%d ", initialcounts::itemsorder[results[j]]); if ( depth > 1 ) fprintf (outfilename, "(%d)\n", trienode[i].count); else fprintf (outfilename, "(%d)\n", initialcounts::items_frequency[i]); if ( trienode[i].number_followers > 0 ) printout (depth+1, trienode[i].next, trienode[i].number_followers); }//for }//trie::printout // main program int main (int argc, char *argv[ ]) { long int time1, time2, c; minsup = macro_minsup; if ( minsup <= 0 ) minsup = 1; time1 = time (&c); printf ("\nReading input starts ...\n"); initialcounts initialdata (input_filename); time2 = time (&c); printf ("Execution time - reading: %ld\n", time2-time1); if ( initialcounts::items_frequency[initialcounts::number_freq_items-1] >= USHRT_MAX ) { time1 = time (&c); printf ("Building (big) FP-tree starts ...\n"); FP theFPtree; theFPtree.buildFP (input_filename); printf ("FP-tree completed\n"); time2 = time (&c); printf ("Execution time - FP-phase: %ld\n", time2-time1); time1 = time (&c); printf ("Counting (big) starts ...\n"); trie ourbigtrie (initialdata); ourbigtrie.build_up (theFPtree); time2 = time (&c); printf ("Execution time - counting: %ld\n\n", time2-time1); if ( argc > 3 ) ourbigtrie.printtrie (output_filename); }//if else { time1 = time (&c); printf ("Building (small) FP-tree starts ...\n"); FP theFPtree; theFPtree.buildFP (input_filename); printf ("FP-tree completed\n"); time2 = time (&c); printf ("Execution time - FP-phase: %ld\n", time2-time1); time1 = time (&c); printf ("Counting (small) starts ...\n"); trie oursmalltrie (initialdata); oursmalltrie.build_up (theFPtree); time2 = time (&c); printf ("Execution time - counting: %ld\n\n", time2-time1); if ( argc > 3 ) oursmalltrie.printtrie (output_filename); }//else return 0; }//main