/** * HashCount, by David Greenspan. Reads words from the text file $1 * and prints out the top 25 most frequent words using a hashing * algorithm. */ #include #include #define MAX_WORD_LENGTH 31 #define INPUT_BUFFER_SIZE 1024 #define LETTER_DATA_SIZE (1024*1024) #define NUM_HASH_BINS 0x00010000 #define HASH_MASK 0x0000FFFF #define HASH_X 179424673 // the 10-millionth prime number #define NUM_TOP_WORDS 25 char _letters[LETTER_DATA_SIZE]; int _numLetters = 0; char *_hashBins[NUM_HASH_BINS]; int _hashBinCounts[NUM_HASH_BINS]; int _topWords[NUM_TOP_WORDS]; int _curTopWords = 0; int _lowestTopWordCount = 0; void printTopWords() { int i; for(i=0;i<_curTopWords;i++) { int w = _topWords[i]; printf("%d %s\n", _hashBinCounts[w], _hashBins[w]); } } void registerTopWord(int w) { int newCount = _hashBinCounts[w]; int insertionPos = _curTopWords; int pos; if (newCount <= _lowestTopWordCount) { return; } pos = _curTopWords-1; while (pos >= 0) { int tw = _topWords[pos]; int count = _hashBinCounts[tw]; if (count > newCount) { break; } else { insertionPos = pos; } pos--; } if (_curTopWords < NUM_TOP_WORDS) { _curTopWords++; } pos = _curTopWords-1; while (pos > 0) { if (pos == insertionPos) { break; } pos--; _topWords[pos+1] = _topWords[pos]; } _topWords[insertionPos] = w; if (_curTopWords == NUM_TOP_WORDS) { _lowestTopWordCount = _hashBinCounts[_topWords[NUM_TOP_WORDS-1]]; } } void registerAll() { int i; for(i=0;i 0) { int i; for(i=0;i= 'a' && c <= 'z') { wordBuffer[wordLength++] = c; } else if (c >= 'A' && c <= 'Z') { wordBuffer[wordLength++] = c + 'a' - 'A'; } else { if (wordLength > 0) { wordBuffer[wordLength] = '\0'; newWord(wordBuffer); wordLength = 0; } } } } registerAll(); printTopWords(); return 0; }