/** * TreeCount, by David Greenspan. Reads words from the text file $1 * and prints out the top 25 most frequent words using a binary * tree algorithm. */ #include #include #define MAX_WORD_LENGTH 31 #define INPUT_BUFFER_SIZE 1024 #define NUM_MEMORY_WORDS (1024*1024) // 4 MB #define NUM_WORD_WORDS 8 #define NUM_TOP_WORDS 25 // assume 4-byte int int _mem[NUM_MEMORY_WORDS]; int _memSize = 0; int _letters[NUM_WORD_WORDS]; char _word[MAX_WORD_LENGTH+1]; int _topWords[NUM_TOP_WORDS]; int _curTopWords = 0; int _lowestTopWordCount = 0; char HEX_CHARS[] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; void wordToLetters(char *word, int *letters) { char *c = word; int index = 0; do { int w = (index >> 2); int b = (index & 3); if (b == 0) { letters[w] = 0; } letters[w] |= ((*c) << ((3-b)<<3)); index++; } while ((*(c++)) != 0); } void lettersToWord(int *letters, char *word) { char *c = word; int index = 0; do { int w = (index >> 2); int b = (index & 3); *c = ((letters[w] >> ((3-b)<<3)) & 0xFF); index++; } while ((*(c++)) != 0); } int letterCompare(int *letters1, int *letters2) { int *a = letters1; int *b = letters2; do { if (*a < *b) return -1; if (*a > *b) return 1; a++; b++; } while (((*(a-1)) & 0xFF) != 0); return 0; } void printTopWords() { int i; for(i=0;i<_curTopWords;i++) { int node = _topWords[i]; lettersToWord(&_mem[node+3], _word); printf("%d %s\n", _mem[node], _word); } } void registerTopWord(int node) { int newCount = _mem[node]; int pos; int insertionPos = _curTopWords; int oldPosFound = -1; int sliding = 0; if (newCount <= _lowestTopWordCount) { return; } pos = _curTopWords-1; while (pos >= 0) { int tw = _topWords[pos]; if (tw == node) { oldPosFound = pos; } int count = _mem[tw]; if (count > newCount) { break; } else { insertionPos = pos; } pos--; } if (oldPosFound >= 0) { sliding = 0; } else { sliding = 1; if (_curTopWords < NUM_TOP_WORDS) { _curTopWords++; } } pos = _curTopWords-1; while (pos > 0) { if (pos == oldPosFound) { sliding = 1; } if (pos == insertionPos) { break; } pos--; if (sliding) { _topWords[pos+1] = _topWords[pos]; } } _topWords[insertionPos] = node; if (_curTopWords == NUM_TOP_WORDS) { _lowestTopWordCount = _mem[_topWords[NUM_TOP_WORDS-1]]; } } void registerWordsRecursive(int node) { int left, right; if (node >= _memSize) return; registerTopWord(node); left = _mem[node+1]; right = _mem[node+2]; if (left != 0) registerWordsRecursive(left); if (right != 0) registerWordsRecursive(right); } void newWord(char *word) { int nodeLoc = 0; int newCount; wordToLetters(word, _letters); while (nodeLoc < _memSize) { int compare = letterCompare(_letters, &_mem[nodeLoc+3]); if (compare == 0) { break; } else { int ptrLoc, newLoc; if (compare == -1) ptrLoc = nodeLoc+1; else if (compare == 1) ptrLoc = nodeLoc+2; newLoc = _mem[ptrLoc]; if (newLoc == 0) { nodeLoc = _memSize; _mem[ptrLoc] = nodeLoc; } else nodeLoc = newLoc; } } if (nodeLoc >= _memSize) { int w; _mem[_memSize++] = 0; _mem[_memSize++] = 0; _mem[_memSize++] = 0; for(w=0;w 0) { int i; for(i=0;i= 'a' && c <= 'z') { wordBuffer[wordLength++] = c; } else if (c >= 'A' && c <= 'Z') { wordBuffer[wordLength++] = c + 'a' - 'A'; } else { if (wordLength > 0) { wordBuffer[wordLength] = '\0'; newWord(wordBuffer); wordLength = 0; } } } } registerWordsRecursive(0); printTopWords(); return 0; }