#include #include #include #include #include #include #include "utils/type.h" using namespace std; #define MAX_WORD_LEN 50 #define MIN_WORD_LEN 3 //a valid word cannot: //1. exceed length MAX_WORD_LEN //2. short words with length 1 and 2 //3. both start and end with digits (e.g. contain only digits) void filter(const char *str, vector& result) { char buf[MAX_WORD_LEN + 1]; int count=0; int len=strlen(str); bool overflow = false; for(int i=0; i 0) { if(overflow)//ignore overflowed word with unreasonable length { overflow = false; count=0; } else { buf[count]='\0'; if(isalpha(buf[0]) || isalpha(buf[count-1])) if(count >= MIN_WORD_LEN) result.push_back(buf); count=0; } } } if(count>0) { buf[count]='\0'; if(isalpha(buf[0]) || isalpha(buf[count-1])) if(count >= MIN_WORD_LEN) result.push_back(buf); } } //*************************************** //Define index class. Used when application will use load index function class InvIdx { public: hash_map > inv_index; //API void insert_content(string & content, int vpos) { vector results; filter(content.c_str(), results); for (int i = 0;i < results.size();i++) { inv_index[results[i]].push_back(vpos); } } void insert(string content, int vpos) { inv_index[content].push_back(vpos); } vector& vpos_list(string key) { return inv_index[key]; } }; //*************************************** char all_one_array[9] = {0,1,3,7,15,31,63,127,-1};//0,1,11,111,1111,11111,111111,1111111,11111111 //Helper functions: operation on bitmap void setbit(char & bitmap, int bit) //'bit' should belong to [0,7] { bitmap |= (1 << bit); } int getBit(char & bitmap, int bit) //'bit' should belong to [0,7] { return ((bitmap & (1 << bit)) == 0) ? 0 : 1; } bool dominates(char &bitmap1, char &bitmap2) //question:when to use? { if(bitmap1 == bitmap2) return false; return ((bitmap1 | bitmap2) == bitmap1); } bool isAllOne(char &bitmap, int num_of_keywords) { return (bitmap == all_one_array[num_of_keywords]); }