|
Posted on 2007-05-13 00:10 花之剑 阅读(367) 评论(0) 编辑 收藏 所属分类: c/c++ & algorithm
用C写的小型电子商务购物搜索引擎网页分词程序 网页的抓取是用Larbin实现的如图 ''~`` ( o o ) web index文件
+------------------.oooO--(_)--Oooo.---------------------+ |Lantin | | E-mail: lantin_fang@163.com | | | | +---------------------\ (----( )-----------------------+ \_) ) / (_/
1#include "global.h" 2#include "types.h" 3struct WordStruct words[1000]; 4int wordsLen; //切词词典的长度 5int fileNum=0; 6FILE * indexFp; 7 8//打开索引表文件 9void openIndex() 10{ 11 if((indexFp=fopen("result.txt","a+"))==NULL) 12 { 13 perror("open the indexResultFile error"); 14 exit(0); 15 } 16} 17 18 19//wirte 将结果写入到文件中 20void writeWords(char *words) 21{ 22 fprintf(indexFp,"%s",words); 23} 24//char *words;/*分词字典*/ 25 void getWords() 26 { 27 FILE * wordsFp; 28 if((wordsFp=fopen("word","r"))==NULL) 29 { 30 perror("open the indexFile error"); 31 exit(0); 32 } 33 FILE * in; 34 if((in=fopen("wo.txt","w"))==NULL) 35 { 36 perror("open error"); 37 } 38 int i=0; 39 while(fgets(words[i].words,10, wordsFp)){ 40 int size=strlen(words[i].words); 41 words[i].words[size-1]='\0'; 42 fprintf(in,"%d %s %d\n",i+33,words[i].words,1); 43 44 i++; 45 } 46 wordsLen=i; 47 48 } 49 50 51char *prasefilename(char *filename,int pos,int len) 52 { 53 char *buffer; 54 int i,j=0,slen; 55 56 slen=strlen(filename); 57 buffer=(char *)malloc(sizeof(char)*slen); 58 for(i=pos;i<len;i++) // 59 { 60 if(i<=slen) 61 buffer[j++]=filename[i]; 62 else 63 break; 64 } 65 buffer[j]='\0'; 66 67 return(buffer); 68 } 69 70 71 int dealWords(char *p,int position) 72 { 73 int singleLen=strlen(p); 74 int k=singleLen; 75 while(k>0) 76 { 77 char *tmp=prasefilename(p,0,k); 78 79 if(match(tmp,position)==1) 80 { 81 (unit.keywords,tmp); 82 83fprintf(indexFp,"%s\n"," "); 84 if(k<singleLen) 85 dealWords(prasefilename(p,k,singleLen),position); 86 break; 87 }else if(k<4) 88 { 89 dealWords(prasefilename(p,k,singleLen),position); 90 break; 91 }else 92 { 93 //printf("k>4 %s\n",tmp); 94 } 95 k=k-2; 96 } 97 if(singleLen==2) 98 return 1; 99 100 } 101 102int match(char *str,int position) 103 { 104 int i=0; 105 while(i<wordsLen) 106 { 107 if(strcmp(words[i].words,str)==0) 108 { 109 fprintf(indexFp,"%3d%10s%10d",fileNum,str,position); 110 return 1; 111 break; 112 } 113 else 114 { i++; 115 //continue; 116 } 117 118 } 119 if(i==wordsLen) 120 return 0; 121 122 123 124 } 125 126 127 //对给定的字符串进行切割 128 void cutWords(char *str) 129 { 130 char *p,*buffer,*string; 131 int i,num; 132 string=str; 133 p = strtok(string," "); 134 int position=0; 135 while( p!= NULL ) 136 { 137 138 /* While there are tokens in "string" */ 139 if(strlen(p)>=10) 140 { 141 buffer=p; 142 num=strlen(p)/10; 143 for(i=0;i<num;i++) 144 { 145 char *tmp=prasefilename(buffer,10*i,10*i+10); 146 dealWords(tmp,position); 147 148 } 149 150 }else 151 { 152 153 dealWords(p,position); 154 155 } 156 position+=strlen(p)+1; 157 p = strtok(NULL," "); 158 /*注意到上面这个NULL,它表明的是从上次调用结果中strtok自有的缓冲区中继续取出余下的子串*/ 159 } 160 161 162 } 163int main(int argc,char argv[]) 164{ 165 166 //打开切词词典,存入缓冲中,以备下调用 167 getWords(); 168 169 //打开结果索引文件,将三元组存在此文件中 170 openIndex(); 171 172 DIR *dir; 173 struct dirent *ptr; 174 int i=0; 175 dir=opendir("d00000/"); 176 177 while((ptr=readdir(dir))!= NULL) 178 { 179 char htmlFileContent[1024*400]; 180 char textFileContent[1024*400]; 181 char reviseContent[1024*400]; 182 i++; 183 if(i<3) 184 continue; 185 //printf("d_name: %s\nn", ptr->d_name); 186 //char resultFileName[]= "tt/"; 187 char fileName[]= "d00000/"; 188 strcat(fileName,ptr->d_name); 189 //strcat(resultFileName,ptr->d_name); 190 printf("网页 %s ==> 分词完毕\n",fileName); 191 //打开html page 192 openPage(fileName, htmlFileContent); 193 194 //将html网页转换为文本信息 195 HtmlToText(htmlFileContent,textFileContent); 196 197 //对文本信息进行过滤 198 ReviseString(textFileContent,reviseContent); 199 200 // printf("text=%s\n",reviseContent); 201 //printf("text=%s\n",reviseContent); 202 fileNum++; 203 204 } 205 closedir(dir); 206 fclose(indexFp); 207 208 return 0; 209} 210 211void openPage(char *pageName,char *htmlFileContent) 212{ 213 FILE *fp; 214 long int size; //定义该文件大小 215 if ((fp=fopen(pageName,"r"))==NULL) 216 { 217 printf("cannot open file\n"); 218 exit(0); 219 220 } 221 222 fseek(fp,0,SEEK_END); 223 size=ftell(fp); 224 //buff=(char *) malloc(size); 225 fseek(fp,0,SEEK_SET); 226 //读取指定大小文件 227 if (fread(htmlFileContent,1,size,fp)==0) { 228 printf("read error!!"); 229 //exit(0); 230 }; 231 fclose(fp); 232} 233 234 235//html to text 236void HtmlToText(char* inbuffer,char* outbuffer) 237 { 238 int bIsText = 0; 239 while(*inbuffer) 240 { 241 if(*inbuffer == '<') 242 { 243 bIsText = 0; 244 } 245 else if(*inbuffer == '>') 246 { 247 bIsText = 1; 248 inbuffer++; 249 continue; 250 }; 251 if(bIsText) 252 { 253 254 *outbuffer = *inbuffer; 255 outbuffer++; 256 *outbuffer = '\0'; 257 } 258 inbuffer++; 259 } 260 261 } 262 263 /*整理字符串(对标点符号,中英文混排等初步处理)*/ 264 void ReviseString(char *str,char *reviseContent) 265 { 266 char splitChar =' '; //分割符号 267 long int slen=strlen(str); 268 int prechar=0; // 0-空白 1-英文 2-中文 3-符号 269 long int i=0; 270 while(i++<slen) 271 { 272 if(str[i]<=64 && str[i]>0 ) 273 { 274 if(prechar==0) 275 { 276 if(str[i]=='\n' || str[i]=='\r' || str[i]==' ') 277 { 278 continue; 279 } 280 }else 281 { 282 strncat(reviseContent,&splitChar,1);prechar=0; 283 continue; 284 } 285 }else if(str[i]>64) 286 { 287 if(prechar==2 || prechar==3){strncat(reviseContent,&splitChar,1); } 288 strncat(reviseContent,&str[i],1); 289 prechar=1; 290 }else if(str[i]<0) 291 { 292 if(prechar!=0&& prechar!=2){strncat(reviseContent,&splitChar,1);} 293 strncat(reviseContent,&str[i],1); 294 prechar=2; 295 296 } 297 298 } 299 300 /* 301 *调用切词程序 302 */ 303 //fprintf(indexFp,"%s",reviseContent); 304 cutWords(reviseContent); 305 306 } 307
|