随笔-23  评论-58  文章-0  trackbacks-0
基于词典的正向最大匹配中文分词算法,能实现中英文数字混合分词。比如能分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤

第一次写中文分词程序,欢迎拍砖。

查看第2版:正向最大匹配分词程序,能实现中英文数字混合分词 (第二版)

public class MM2 
{
    
private static final Log log = LogFactory.getLog(MM2.class);
    
    
private static HashMap<String, Integer> dictionary = null
    
private static final int WORD_MAX_LENGTH = 9;
    
private Reader reader;
    
    
static
    
{
        loadDictionary();
    }

    
    
public MM2(Reader reader) 
    

        
this.reader = reader; 
    }
 
    
    
//切分出由中文、字母、数字组成的句子
    public ArrayList<Sentence> getSentence() throws IOException
    
{   
        ArrayList
<Sentence> list=new ArrayList<Sentence>();
        StringBuffer cb
=new StringBuffer();
        
int d=reader.read();
        
int offset=0;
        
boolean b=false;
        
while(d>-1)
        
{
            
int type=Character.getType(d);
            
if(type==2 || type==9 || type==5)
            
{
                d
=toAscii(d);
                cb.append((
char)d);
            }

            
else
            
{
                b
=true;
            }

            d
=reader.read();
            
if(d==-1 || b)
            
{
                
if(d==-1) offset++;
                b
=false;
                
char[] ioBuffer = new char[cb.length()];
                cb.getChars(
0, cb.length(), ioBuffer, 0);
                Sentence sen
=new Sentence(ioBuffer,offset-cb.length());
                list.add(sen);
                cb.setLength(
0);
            }

            offset
++;
        }

        
return list;
    }

    
    
//将句子切分出词
    public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    
{
        ArrayList
<Token> tokenlist=new ArrayList<Token>();
        
for(Sentence sen:list)
        
{
            StringBuffer word 
= new StringBuffer();
            
int offset=sen.getStartOffset();
            
int bufferIndex = 0;
            
char c;
            
boolean b=false;
            
while(bufferIndex<sen.getText().length)
            
{
                offset
++;
                c
=sen.getText()[bufferIndex++];
                
if(word.length()==0)
                    word.append(c);
                
else
                
{
                    String temp 
= (word.toString() + c).intern();
                    
if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                        word.append(c);
                    
else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
                        word.append(c);
                    
else
                    
{
                        bufferIndex
--;
                        offset
--;
                        
while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                        
{
                            word.deleteCharAt(word.length()
-1);
                            bufferIndex
--;
                            offset
--;
                        }

                        b
=true;
                    }

                }

                
if(b || bufferIndex==sen.getText().length)
                
{
                    Token token 
= new Token(word.toString(),offset-word.length(),offset,"word");
                    word.setLength(
0);
                    tokenlist.add(token);
                    b
=false;
                }

            }

        }

        
return tokenlist;
    }

    
    
//将相连的单个英文或数字组合成词
    public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    
{
        ArrayList
<Token> tokenlist=new ArrayList<Token>();
        Token word
=null;
        
for(int i=0;i<list.size();i++)
        
{
            Token t
=list.get(i);
            
if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
            
{
                
if(word==null)
                    word
=t;
                
else if(word.getEnd()==t.getStart())
                
{
                    word.setEnd(t.getEnd());
                    word.setWord(word.getWord()
+t.getWord());
                }

                
else
                
{
                    tokenlist.add(word);
                    word
=t;
                }

            }

            
else if(word!=null)
            
{
                tokenlist.add(word);
                word
=null;
                tokenlist.add(t);
            }

            
else
                tokenlist.add(t);
        }

        
if(word!=null)
            tokenlist.add(word);
        
return tokenlist;
    }

    
    
//双角转单角
    public static int toAscii(int codePoint) 
    
{
        
if((codePoint>=65296 && codePoint<=65305)    //0-9
                || (codePoint>=65313 && codePoint<=65338)    //A-Z
                || (codePoint>=65345 && codePoint<=65370)    //a-z
                )
        
{    
            codePoint 
-= 65248;
        }

        
return codePoint;
    }

    
    
//加载词典
    public static void loadDictionary() 
    
{  
        
if (dictionary == null
        
{    
            dictionary 
= new HashMap<String, Integer>();    
            InputStream is 
= null;    
            BufferedReader br 
= null;            
            
try
            
{
                is 
= new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
                br 
= new BufferedReader(new InputStreamReader(is, "UTF-8"));
                String word 
= null;
                
while ((word = br.readLine()) != null
                
{
                    word
=word.toLowerCase();
                    
if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                    
{
                        dictionary.put(word.intern(), 
1);    
                        
int i = word.length()-1
                        
while(i >= 2)
                        
{
                            String temp 
= word.substring(0, i).intern(); 
                            
if (!dictionary.containsKey(temp))
                                dictionary.put(temp,
2); 
                            i
--;
                        }

                    }

                }

            }

            
catch (Exception e) 
            
{      
                log.info(e);
            }

            
finally
            
{
                
try 
                
{      
                    
if(br!=null)
                        br.close();   
                    
if(is!=null)
                        is.close();  
                }

                
catch (IOException e)
                
{     
                    log.info(e);
                }
            
            }
 
        }
 
    }

    
    
public static String[] segWords(Reader input)
    
{
        ArrayList
<String> list=new ArrayList<String>();
        
try
        
{
            MM2 f
=new MM2(input);
            ArrayList
<Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
            
for(Token t:tlist)
            
{
                list.add(t.getWord());
            }

        }

        
catch(IOException e)
        
{
            log.info(e);
        }

        
return (String[])list.toArray(new String[0]);
    }

    
    
public static void main(String[] args) 
    
{
        String[] cc
=MM2.segWords(new StringReader("ibm商务机t60p".toLowerCase()));
        
for(String c:cc)
        
{
            System.out.println(c);
        }

    }

}
posted on 2011-08-04 15:31 nianzai 阅读(3456) 评论(1)  编辑  收藏 所属分类: 中文分词

评论:
# re: 基于词典的正向最大匹配中文分词算法,能实现中英文数字混合分词 2014-09-13 18:30 | 余道
您好,您没有给出Sentence和Token的定义,我猜不出啊

hdwgz@qq.com  回复  更多评论
  

只有注册用户登录后才能发表评论。


网站导航: