// // dfmemory.cc memory efficient version // C++ program that finds frequent itemsets - file dfmemory.cc // September 26, 2003 // November 17, 2003: changed unsigned short count into int count // Wim Pijls and Walter Kosters // Erasmus University Rotterdam and Leiden University, The Netherlands // pijls@few.eur.nl and kosters@liacs.nl // http://www.liacs.nl/home/kosters/df/ // // makefile: g++ -Wall -O3 -o fim_all dfmemory.cc // // Assume that the relevant database and the trie containing all frequent // sets together fit in main memory; relevant here means: all transactions // that contain at least 2 frequent items (the so-called relevant // transactions), whereas only frequent items are considered. Every byte // contains 8 bits of the database. So its size is the number of // frequent items times the number of relevant transactions divided by 8. // // The data is read four times: // 1. to find minimal and maximal item number that occurs // 2. to determine the frequencies of the single item numbers // 3. to find the relevant transactions (containing at least 2 frequent items) // 4. to store the (relevant) database in main memory // // name inputfile: #define input_filename argv[1] // (absolute) value of minsup: #define macro_minsup atoi(argv[2]) // name outputfile: #define output_filename argv[3] #include #include #include #include #include using namespace std; typedef char *transaction; const int MAXDEPTH = 100; // maximal depth of trie (for printing) int minsup; // minimum support int therow; // number of currect transaction transaction globpointer; // current transaction transaction *dataset; // the whole database char vector[8]; // masks for fast char *supervector; // array addressing class initialcounts { public: initialcounts (char *inputdata); static int *itemsorder; static int *items_frequency; static short *ranking; static int min_itemnr, max_itemnr; static int number_transactions, number_freq_items, lines; private: void first_pass ( ); void second_pass ( ); void third_pass ( ); void initialsort ( ); char *infilename; int *init_items_frequency; }; class data_array { public: data_array (char *inputdata); private: char *next_transaction; void inputread (char *inputdata); }; struct bucket { short itemvalue; int count; // used to be unsigned short count; short number_followers; struct bucket *next; }; void count (bucket *trienode, short number_buckets); // does currect transaction (globpointer) contain item "column"? inline int inspect (int column) { return ( ( globpointer[column >> 3] & supervector[column] ) ); }//inspect class trie { public: trie ( ) { }; trie (initialcounts & initialdata, data_array & datagrid); void build_up ( ); void printtrie (char *outputdata); private: int length_count[MAXDEPTH]; void extend (int k); void cleanup (bucket * & trienode, short & number); FILE *outfilename; struct bucket *root; int triesize; int results[MAXDEPTH]; void copying (struct bucket *p, struct bucket *q, int number_q_buckets); void printout (int depth, struct bucket *trienode, int number_buckets); }; int *initialcounts::items_frequency = NULL; short *initialcounts::ranking = NULL; int *initialcounts::itemsorder = NULL; int initialcounts::number_transactions = 0; int initialcounts::lines = 0; int initialcounts::number_freq_items = 0; int initialcounts::min_itemnr = 0; int initialcounts::max_itemnr = 0; // constructor initialcounts::initialcounts (char *inputdata) { infilename = inputdata; first_pass ( ); second_pass ( ); third_pass ( ); initialsort ( ); }//initialcounts::initialcounts // computes minimal and maximal item number that occur in the database; // if these are known in advance, this function can be easily adapted // function reads whole file! void initialcounts::first_pass ( ) { int itemnr; bool first = true; ifstream fin (infilename); if ( ! fin ) cout << "No such filename" << endl; char c; int pos; do { do { fin.get (c); itemnr = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { itemnr = 10*itemnr + (int)(c) - (int)('0'); pos++; fin.get (c); }//while if ( pos ) { if ( first ) max_itemnr = min_itemnr = itemnr; first = false; if ( itemnr < min_itemnr ) min_itemnr = itemnr; else if ( itemnr > max_itemnr ) max_itemnr = itemnr; }//if } while ( c != '\n' && ! fin.eof ( ) ); } while ( ! fin.eof ( ) ); fin.close ( ); }//initialcounts::first_pass // determines frequency for all items, and number of frequent items // function reads whole file! void initialcounts::second_pass ( ) { int k; ifstream fin (infilename); if ( ! fin ) cout << "No such filename" << endl; int itemrange = max_itemnr-min_itemnr+1; init_items_frequency = new int[itemrange]; for ( k = 0; k < itemrange; k++ ) init_items_frequency[k] = 0; char c; int item, pos; do { do { fin.get (c); item = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { item = 10*item + (int)(c) - (int)('0'); pos++; fin.get (c); }//while if ( pos ) init_items_frequency[item-min_itemnr]++; } while ( c != '\n' && ! fin.eof ( ) ); } while ( ! fin.eof ( ) ); fin.close ( ); for ( k = 0; k < itemrange; k++ ) if ( init_items_frequency[k] >= minsup ) number_freq_items++; // printf ("Number of frequent items: %d\n", number_freq_items); }//initialcounts::second_pass // determines number of relevant transactions, i.e., those // that contain at least two frequent items // (those that contain one frequent item have already been accounted for // while determining the frequency of the single items) // function reads whole file! void initialcounts::third_pass ( ) { number_transactions = 0; ifstream fin (infilename); if ( ! fin) cout << "No such filename" << endl; char c; int item, pos, items_in_trans; bool line; do { line = false; items_in_trans = 0; do { fin.get (c); item=0; pos=0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { item = 10*item + (int)(c) - (int)('0'); pos++; line = true; fin.get (c); }//while if ( pos && init_items_frequency[item-min_itemnr] >= minsup ) items_in_trans++; } while ( c != '\n' && ! fin.eof ( ) ); if ( line ) lines++; if ( items_in_trans >= 2 ) number_transactions++; } while ( ! fin.eof ( ) ); // printf ("Number of relevant transactions: %d\n", number_transactions); // printf ("Number of lines: %d\n", lines); fin.close ( ); }//initialcounts::third_pass // sort items with respect to support - and renumber void initialcounts::initialsort ( ) { int left_cursor = 0; int right_cursor = max_itemnr-min_itemnr; int itemrange = right_cursor+1; int *items_numbers; int i, j, k; items_numbers = new int[number_freq_items]; for ( k = 0; k < number_freq_items; k++ ) items_numbers[k] = k; while ( left_cursor < right_cursor ) { while ( init_items_frequency[left_cursor] >= minsup && left_cursor= 0; rank-- ) { maxfrequency = -1; for ( j = 0; j < number_freq_items; j++ ) if ( used[j] == 0 && init_items_frequency[j] > maxfrequency ) { bestindex = j; bestitem = items_numbers[j]; maxfrequency = init_items_frequency[j]; }//if itemsorder[rank] = bestitem+min_itemnr; ranking[bestitem] = rank; items_frequency[rank] = maxfrequency; used[bestindex] = 1; }//for }//initialcounts::initialsort // constructor data_array::data_array (char *inputdata) { int v = 1; vector[0] = (char) v; for ( int k = 1; k < 8; k++ ) { v = 2*v; vector[k] = (char) v; }//for supervector = new char[initialcounts::number_freq_items]; for ( int col = 0; col < initialcounts::number_freq_items; col++ ) supervector[col] = vector[col & 7]; dataset = new transaction[initialcounts::number_transactions]; inputread (inputdata); }//data_array::data_array // reads the entire datafile from file inputdata and // puts in into a 2-dimensional character array (eight 0/1's per byte) // reads whole file (for the fourth time)! void data_array::inputread (char *inputdata) { int pos, item, items_count; short newitemnr; char c; ifstream fin (inputdata); if ( ! fin ) cout << "No such filename" << endl; int arraywidth = (int)((initialcounts::number_freq_items-1)/8) + 1; int rowcounter = 0; do { next_transaction = new char[arraywidth]; for ( int column = 0; column < arraywidth; column++ ) next_transaction[column] = '\000'; items_count = 0; do { fin.get (c); item = 0; pos = 0; while ( ( c >= '0' ) && ( c <= '9' ) && ! fin.eof ( ) ) { item = 10*item + (int)(c) -(int)('0'); pos++; fin.get (c); }//while if ( pos && initialcounts::ranking[item-initialcounts::min_itemnr] >= 0 ) { newitemnr = initialcounts::ranking[item-initialcounts::min_itemnr]; next_transaction[newitemnr >> 3] |= supervector[newitemnr]; items_count++; }//if } while ( c != '\n' && ! fin.eof ( ) ); if ( items_count >= 2 ) dataset[rowcounter++] = next_transaction; else delete [ ] next_transaction; } while( ! fin.eof ( ) ); fin.close ( ); }//data_array::inputread // constructor trie::trie (initialcounts & initialdata, data_array & datagrid) { triesize = initialcounts::number_freq_items; root = (bucket*) calloc (triesize, sizeof (bucket)); for ( int itemnr = 0; itemnr < triesize; itemnr++ ) { root[itemnr].count = minsup; // to avoid problems with short's root[itemnr].itemvalue = itemnr; root[itemnr].number_followers = 0; root[itemnr].next = NULL; }//for }//trie::trie // build trie out of transactions void trie::build_up ( ) { int upperbound = initialcounts::number_freq_items-2; bucket *root2; short numberfol; for ( int k = upperbound; k >= 0; k-- ) { extend (k); root2 = root[k].next; numberfol = root[k].number_followers; for ( therow = 0; therow < initialcounts::number_transactions; therow++ ) { globpointer = dataset[therow]; if ( inspect (k) ) count (root2, numberfol); }//for if ( numberfol > 0 ) cleanup (root[k].next, root[k].number_followers); }//for }//trie::build_up // extend trie at position k void trie::extend (int k) { root[k].next = (bucket*) calloc (triesize-k-1, sizeof (bucket)); root[k].number_followers = triesize-k-1; copying (root[k].next, root+k+1, triesize-1-k); }//trie::extend // push contents of buckets to the beginning of the array, // leaving number of them (call by reference parameter!) void trie::cleanup (bucket * & trienode, short & number) { short i, j = 0; for ( i = 0; i < number; i++ ) { if ( trienode[i].number_followers > 0 ) cleanup (trienode[i].next, trienode[i].number_followers); if ( trienode[i].count >= minsup ) { trienode[j] = trienode[i]; j++; }//if }//for // memory usage optimizer: trienode = (bucket*) realloc (trienode, j*sizeof (bucket)); number = j; // call by reference parameter! }//trie::cleanup // copy trie structure from q to p void trie::copying (struct bucket *p, struct bucket *q, int number_q_buckets) { short temp; for ( int source = 0; source < number_q_buckets; source++ ) { p->count = 0; p->itemvalue = q->itemvalue; p->number_followers = temp = q->number_followers; if ( temp > 0 ) { p->next = (bucket*) calloc (temp, sizeof (bucket)); copying (p->next, q->next, temp); }//if else p->next = NULL; p++; // pointer arithmetic q++; }//for }//trie::copying // do the counting: process current transaction recursively through trienode void count (bucket *trienode, short number_buckets) { for ( short i = 0; i < number_buckets; i++ ) { if ( inspect (trienode->itemvalue) ) { trienode->count++; if ( trienode->number_followers > 0 ) count (trienode->next, trienode->number_followers); }//if trienode++; // pointer arithmetic }//for }//count // print resulting trie and frequency of each pattern length void trie::printtrie (char *outputdata) { int k; for ( k = 0; k < MAXDEPTH; k++ ) length_count[k] = 0; outfilename = fopen (outputdata, "w"); fprintf (outfilename, "(%d)\n", initialcounts::lines); printout (1, root, triesize); int lpl; for ( lpl = MAXDEPTH-1; lpl >= 0 && length_count[lpl] == 0; lpl-- ) ; if ( initialcounts::lines >= minsup ) printf ("1\n"); for ( k = 1; k <= lpl; k++ ) printf ("%d\n", length_count[k]); fclose(outfilename); }//trie::printtrie // do the printing void trie::printout (int depth, struct bucket *trienode, int number_buckets) { for ( int i = 0; i < number_buckets; i++ ) { results[depth] = trienode[i].itemvalue; length_count[depth]++; for ( int j = 1; j <= depth; j++ ) fprintf (outfilename, "%d ", initialcounts::itemsorder[results[j]]); if ( depth > 1 ) fprintf (outfilename, "(%d)\n", trienode[i].count); else fprintf (outfilename, "(%d)\n", initialcounts::items_frequency[i]); if ( trienode[i].number_followers > 0 ) printout (depth+1, trienode[i].next, trienode[i].number_followers); }//for }//trie::printout // main program int main (int argc, char *argv[ ]) { // long int time1, time2, c; // time1 = time (&c); minsup = macro_minsup; // printf ("\nReading input starts ...\n"); initialcounts initialdata (input_filename); data_array datagrid (input_filename); // printf ("Reading input finished\n"); // time2 = time (&c); // printf ("Execution time so far: %ld\n", time2-time1); // printf ("Counting starts ...\n"); // time1 = time (&c); trie ourtrie (initialdata,datagrid); ourtrie.build_up ( ); // time2 = time (&c); // printf ("Execution time - counting: %ld\n\n", time2-time1); if ( argc > 3 ) ourtrie.printtrie (output_filename); return 0; }//main