花之剑'HOME

一朵飘舞在风中的雪花,挣扎着,不想被融化。

原创:用C写的小型网页分词程序

Posted on 2007-05-13 00:10 花之剑 阅读(364) 评论(0)  编辑  收藏 所属分类: c/c++ & algorithm

  用C写的小型电子商务购物搜索引擎网页分词程序 
  网页的抓取是用Larbin实现的,如图:
 
         ''~``
                        ( o o )
web index文件 

+------------------.oooO--(_)--Oooo.---------------------+
|Lantin                                            |
|                       E-mail: lantin_fang@163.com  |
|
|                               |
+---------------------\ (----(   )-----------------------+
                       \_)    ) /
                             (_/

  1#include "global.h"
  2#include  "types.h"
  3struct WordStruct words[1000];  // segmentation dictionary entries, filled by getWords()
  4int wordsLen;        // number of dictionary entries loaded into words[] (set by getWords())
  5int fileNum=0;       // incremented by main() after each page; printed by match() in every index record
  6FILE * indexFp;      // index output stream; opened by openIndex() on "result.txt"
  7
  8//打开索引表文件
  9void openIndex()
 10{
 11    if((indexFp=fopen("result.txt","a+"))==NULL)
 12    {
 13        perror("open the indexResultFile error");
 14        exit(0);
 15    }
 16}
 17
 18
 19//wirte 将结果写入到文件中
 20void writeWords(char *words)
 21{
 22    fprintf(indexFp,"%s",words);
 23}
 24//char *words;/*分词字典*/
 25 void getWords()
 26 {
 27     FILE * wordsFp;
 28    if((wordsFp=fopen("word","r"))==NULL)
 29    {
 30        perror("open the indexFile error");
 31        exit(0);
 32    }
 33    FILE * in;
 34    if((in=fopen("wo.txt","w"))==NULL)
 35    {
 36        perror("open error");
 37    }
 38    int i=0;
 39    while(fgets(words[i].words,10, wordsFp)){   
 40          int size=strlen(words[i].words);
 41          words[i].words[size-1]='\0';
 42          fprintf(in,"%d  %s %d\n",i+33,words[i].words,1);
 43
 44           i++;
 45        }  
 46        wordsLen=i;
 47    
 48 }
 49
 50
 51char   *prasefilename(char   *filename,int   pos,int   len)   
 52  {   
 53          char   *buffer;   
 54          int   i,j=0,slen;   
 55      
 56          slen=strlen(filename);   
 57          buffer=(char   *)malloc(sizeof(char)*slen);   
 58          for(i=pos;i<len;i++)   //
 59          {           
 60                  if(i<=slen)   
 61                          buffer[j++]=filename[i];   
 62                  else   
 63                          break;   
 64          }   
 65          buffer[j]='\0';   
 66      
 67          return(buffer);   
 68  }               
 69   
 70
 71 int dealWords(char *p,int position)
 72  {
 73      int singleLen=strlen(p);
 74      int k=singleLen;
 75      while(k>0)
 76      {    
 77         char *tmp=prasefilename(p,0,k);
 78         
 79         if(match(tmp,position)==1)
 80            {  
 81    (unit.keywords,tmp);
 82                        
 83fprintf(indexFp,"%s\n","    ");
 84                if(k<singleLen)
 85                 dealWords(prasefilename(p,k,singleLen),position);
 86                break;
 87            }else if(k<4)
 88            {
 89                dealWords(prasefilename(p,k,singleLen),position);
 90                break;
 91            }else
 92            {
 93                //printf("k>4  %s\n",tmp);
 94            }
 95             k=k-2;
 96     }
 97     if(singleLen==2)
 98        return 1;
 99
100  }
101
102int match(char *str,int position)
103  {
104      int i=0;
105      while(i<wordsLen)
106      {  
107          if(strcmp(words[i].words,str)==0)
108          {
109            fprintf(indexFp,"%3d%10s%10d",fileNum,str,position);
110            return 1;
111            break;
112          }
113          else
114          {    i++;
115            //continue;
116          }
117         
118      }
119     if(i==wordsLen)
120        return 0;
121    
122
123
124  }
125
126 
/*
 * Split str on spaces and run dealWords() on every token.
 *
 * Tokens are handed to dealWords() in pieces of at most 10 bytes.  The final
 * piece may be shorter; the previous code dropped those trailing
 * strlen % 10 bytes of any token of 10 bytes or more.
 *
 * `position` is the running byte offset of the token (token lengths plus one
 * separator each) and is the same for every piece of one token.
 *
 * str is modified in place: strtok() overwrites separators with NUL bytes.
 */
void cutWords(char *str)
{
    enum { CHUNK_LEN = 10 };
    char chunk[CHUNK_LEN + 1];
    int position = 0;
    char *p;

    /* Passing NULL on later calls makes strtok() continue in the same
     * string where the previous call stopped. */
    for (p = strtok(str, " "); p != NULL; p = strtok(NULL, " "))
    {
        size_t tokenLen = strlen(p);
        size_t off;

        for (off = 0; off < tokenLen; off += CHUNK_LEN)
        {
            size_t n = tokenLen - off;

            if (n > CHUNK_LEN)
            {
                n = CHUNK_LEN;
            }

            /* Copy into a local buffer so no heap allocation is needed. */
            memcpy(chunk, p + off, n);
            chunk[n] = '\0';
            dealWords(chunk, position);
        }

        position += (int)tokenLen + 1;
    }
}
163int main(int argc,char argv[])
164{
165    
166    //打开切词词典,存入缓冲中,以备下调用
167      getWords();
168
169    //打开结果索引文件,将三元组存在此文件中
170    openIndex();
171
172   DIR *dir;
173   struct dirent *ptr;
174   int i=0;
175   dir=opendir("d00000/");
176    
177   while((ptr=readdir(dir))!= NULL)
178   {    
179        char htmlFileContent[1024*400];
180        char textFileContent[1024*400];
181        char reviseContent[1024*400];
182        i++;
183        if(i<3)
184         continue;
185         //printf("d_name: %s\nn", ptr->d_name);
186         //char resultFileName[]=    "tt/";
187         char fileName[]=        "d00000/";
188         strcat(fileName,ptr->d_name);
189         //strcat(resultFileName,ptr->d_name);
190        printf("网页 %s     ==>    分词完毕\n",fileName);
191        //打开html page
192         openPage(fileName, htmlFileContent);
193    
194        //将html网页转换为文本信息
195         HtmlToText(htmlFileContent,textFileContent);
196    
197        //对文本信息进行过滤
198         ReviseString(textFileContent,reviseContent);
199    
200    //    printf("text=%s\n",reviseContent);
201        //printf("text=%s\n",reviseContent);
202        fileNum++;
203
204  }
205   closedir(dir);
206   fclose(indexFp);
207
208    return 0;
209}
210
/*
 * Read the file pageName into htmlFileContent as a NUL-terminated string.
 *
 * pageName         path of the page to read
 * htmlFileContent  destination buffer supplied by the caller
 *
 * At most PAGE_MAX_BYTES bytes are stored, followed by a terminator, so the
 * buffer never receives more than 1024*400 bytes - the size of the buffer the
 * caller in this file passes.  Longer files are truncated.
 * NOTE(review): a caller with a smaller buffer is not protected by this cap.
 *
 * If the file cannot be opened the process exits with a failure status.  If
 * its size cannot be determined or the read fails, "read error!!" is printed
 * and the buffer holds whatever was read so far (possibly "").  An empty file
 * is not treated as an error.
 */
void openPage(char *pageName,char  *htmlFileContent)
{
    enum { PAGE_MAX_BYTES = 1024 * 400 - 1 };
    FILE *fp;
    long size;
    size_t got;

    /* Leave a valid empty string behind on every early return. */
    htmlFileContent[0] = '\0';

    fp = fopen(pageName, "r");
    if (fp == NULL)
    {
        printf("cannot open file\n");
        exit(EXIT_FAILURE);
    }

    /* Determine the file size by seeking to the end and back. */
    if (fseek(fp, 0, SEEK_END) != 0
        || (size = ftell(fp)) < 0
        || fseek(fp, 0, SEEK_SET) != 0)
    {
        printf("read error!!");
        fclose(fp);
        return;
    }

    if (size > PAGE_MAX_BYTES)
    {
        size = PAGE_MAX_BYTES;
    }

    got = fread(htmlFileContent, 1, (size_t)size, fp);
    if (got < (size_t)size && ferror(fp))
    {
        printf("read error!!");
    }

    /* Terminate at the number of bytes actually read; later stages scan
     * the buffer until the first NUL. */
    htmlFileContent[got] = '\0';

    fclose(fp);
}
233
234
/*
 * Copy the text between HTML tags from inbuffer to outbuffer.
 *
 * inbuffer   NUL-terminated HTML source
 * outbuffer  destination; must have room for strlen(inbuffer) + 1 bytes
 *
 * Bytes are copied only after a '>' has been seen and until the next '<', so
 * anything before the first tag is discarded, as is everything inside a tag.
 * The angle brackets themselves are never copied.
 *
 * outbuffer is always NUL-terminated, even when no text is found (the
 * previous code left it untouched in that case).
 */
void HtmlToText(char* inbuffer,char* outbuffer)
{
    int inText = 0;

    for (; *inbuffer != '\0'; inbuffer++)
    {
        if (*inbuffer == '<')
        {
            inText = 0;
        }
        else if (*inbuffer == '>')
        {
            inText = 1;
        }
        else if (inText)
        {
            *outbuffer++ = *inbuffer;
        }
    }

    *outbuffer = '\0';
}
262
/*
 * Normalise str into reviseContent (first-pass handling of punctuation and
 * mixed single-byte / multi-byte text), then segment the result with
 * cutWords().
 *
 * str            NUL-terminated input text
 * reviseContent  destination buffer; it is overwritten, not appended to.
 *                Every input byte produces at most two output bytes, so
 *                2 * strlen(str) + 1 bytes are always sufficient.
 *
 * Rules, applied byte by byte:
 *   - bytes 1..64 (whitespace, digits, most punctuation) are dropped; a run
 *     of them following text becomes a single space;
 *   - bytes 65..127 are copied, preceded by a space when the previous byte
 *     had its high bit set;
 *   - bytes with the high bit set are copied, preceded by a space when the
 *     previous byte was in 65..127.
 *
 * Changes from the previous version: the first byte of str is no longer
 * skipped, the high-bit test no longer depends on plain char being signed,
 * and output is written through a pointer instead of one strncat() per byte.
 */
void  ReviseString(char *str,char *reviseContent)
{
    const char splitChar = ' ';     /* separator written between runs */
    size_t slen = strlen(str);
    char *out = reviseContent;
    int prechar = 0;                /* 0 = separator, 1 = byte 65..127, 2 = high-bit byte */
    size_t i;

    for (i = 0; i < slen; i++)
    {
        unsigned char c = (unsigned char)str[i];

        if (c <= 64)
        {
            /* Collapse a run of separators into one space, and emit
             * nothing while no text has been seen since the last one. */
            if (prechar != 0)
            {
                *out++ = splitChar;
                prechar = 0;
            }
        }
        else if (c < 128)
        {
            if (prechar == 2)
            {
                *out++ = splitChar;
            }
            *out++ = (char)c;
            prechar = 1;
        }
        else
        {
            if (prechar != 0 && prechar != 2)
            {
                *out++ = splitChar;
            }
            *out++ = (char)c;
            prechar = 2;
        }
    }
    *out = '\0';

    /* Hand the cleaned text to the segmenter. */
    cutWords(reviseContent);
}
307

只有注册用户登录后才能发表评论。


网站导航: