随笔-23  评论-58  文章-0  trackbacks-0
基于词典的逆向最大匹配中文分词算法,能实现中英文数字混合分词。比如能分出这样的词:bb霜、3室、乐phone、touch4、mp3、T恤。实际分词效果比正向最大匹配分词的效果好。

查看第2版:逆向最大匹配分词程序,能实现中英文数字混合分词 (第二版)

public class RMM
{
    
private static final Log log = LogFactory.getLog(RMM.class);
    
    
private static HashMap<String, Integer> dictionary = null
    
private static final int WORD_MAX_LENGTH = 9;
    
    
static
    
{
        loadDictionary();
    }

    
    
//将句子切分出词,逆向最大匹配
    public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    
{
        Collections.reverse(list);
        ArrayList
<Token> tokenlist=new ArrayList<Token>();
        
for(Sentence sen:list)
        
{
            StringBuffer word 
= new StringBuffer();
            
int offset=sen.getStartOffset()+sen.getText().length;
            
int bufferIndex = sen.getText().length-1;
            
char c;
            
boolean b=false;
            
while(bufferIndex>-1)
            
{
                offset
--;
                c
=sen.getText()[bufferIndex--];
                
if(word.length()==0)
                    word.append(c);
                
else
                
{
                    String temp 
= (c+word.toString()).intern();
                    
if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                        word.insert(
0, c);
                    
else if(dictionary.containsKey(temp) && bufferIndex>-1)
                        word.insert(
0, c);
                    
else
                    
{
                        bufferIndex
++;
                        offset
++;
                        
while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                        
{
                            word.deleteCharAt(
0);
                            bufferIndex
++;
                            offset
++;
                        }

                        b
=true;
                    }

                }

                
if(b || bufferIndex==-1)
                
{
                    Token token 
= new Token(word.toString(),offset,offset+word.length(),"word");
                    word.setLength(
0);
                    tokenlist.add(token);
                    b
=false;
                }

            }

        }

        Collections.reverse(tokenlist);
        
return tokenlist;
    }

    
    
//加载词典
    public static void loadDictionary() 
    
{  
        
if (dictionary == null
        
{    
            dictionary 
= new HashMap<String, Integer>();    
            InputStream is 
= null;    
            BufferedReader br 
= null;            
            
try
            
{
                is 
= new FileInputStream(new File(RMM.class.getClassLoader().getResource("dictionary.txt").toURI()));
                br 
= new BufferedReader(new InputStreamReader(is, "UTF-8"));
                String word 
= null;
                
while ((word = br.readLine()) != null
                
{
                    word
=word.toLowerCase();
                    
if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                    
{
                        dictionary.put(word.intern(), 
1);    
                        
int i = 1
                        
while(i < word.length()-1)
                        
{
                            String temp 
= word.substring(i,word.length()).intern(); 
                            
if (!dictionary.containsKey(temp))
                                dictionary.put(temp,
2); 
                            i
++;
                        }

                    }

                }

            }

            
catch (Exception e) 
            
{      
                log.info(e);
            }

            
finally
            
{
                
try 
                
{      
                    
if(br!=null)
                        br.close();   
                    
if(is!=null)
                        is.close();  
                }

                
catch (IOException e)
                
{     
                    log.info(e);
                }
            
            }
 
        }
 
    }

    
    
public static String[] segWords(Reader reader)
    
{
        ArrayList
<String> list=new ArrayList<String>();
        
try
        
{
            ArrayList
<Token> tlist= Util.getNewToken(getToken(Util.getSentence(reader)));
            
for(Token t:tlist)
            
{
                list.add(t.getWord());
            }

        }

        
catch(IOException e)
        
{
            log.info(e);
        }

        
return (String[])list.toArray(new String[0]);
    }

    
    
public static void main(String[] args) 
     
{
        String[] cc
=RMM.segWords(new StringReader("急、急、急、花里林居,二房二厅,业主诚心,出租".toLowerCase()));
        
for(String c:cc)
        
{
            System.out.println(c);
        }

    }

}


public class Util
{
 //切分出由中文、字母、数字组成的句子
 public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
 {  
  ArrayList<Sentence> list=new ArrayList<Sentence>();
  StringBuffer cb=new StringBuffer();
  int d=reader.read();
  int offset=0;
  boolean b=false;
  while(d>-1)
  {
   int type=Character.getType(d);
   if(type==2 || type==9 || type==5)
   {
    d=toAscii(d);
    cb.append((char)d);
   }
   else
   {
    b=true;
   }
   d=reader.read();
   if(d==-1 || b)
   {
    if(d==-1) offset++;
    b=false;
    char[] ioBuffer = new char[cb.length()];
    cb.getChars(0, cb.length(), ioBuffer, 0);
    Sentence sen=new Sentence(ioBuffer,offset-cb.length());
    list.add(sen);
    cb.setLength(0);
   }
   offset++;
  }
  return list;
 }
 
 //将相连的单个英文或数字组合成词
 public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
 {
  ArrayList<Token> tokenlist=new ArrayList<Token>();
  Token word=null;
  for(int i=0;i<list.size();i++)
  {
   Token t=list.get(i);
   if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
   {
    if(word==null)
     word=t;
    else if(word.getEnd()==t.getStart())
    {
     word.setEnd(t.getEnd());
     word.setWord(word.getWord()+t.getWord());
    }
    else
    {
     tokenlist.add(word);
     word=t;
    }
   }
   else if(word!=null)
   {
    tokenlist.add(word);
    word=null;
    tokenlist.add(t);
   }
   else
    tokenlist.add(t);
  }
  if(word!=null)
   tokenlist.add(word);
  return tokenlist;
 }
 
 //双角转单角
 public static int toAscii(int codePoint)
 {
  if((codePoint>=65296 && codePoint<=65305) //0-9
    || (codePoint>=65313 && codePoint<=65338) //A-Z
    || (codePoint>=65345 && codePoint<=65370) //a-z
    )
  { 
   codePoint -= 65248;
  }
  return codePoint;
 }
}








posted on 2011-08-19 13:22 nianzai 阅读(4482) 评论(2)  编辑  收藏 所属分类: 中文分词

评论:
# re: 基于词典的逆向最大匹配中文分词算法,逆向分词比正向分词效果好 [未登录] 2011-10-21 16:38 | zxj
楼主,代码中的Sentence 类呢?  回复  更多评论
  
# re: 基于词典的逆向最大匹配中文分词算法,逆向分词比正向分词效果好 2011-11-08 15:55 | nianzai
参考正向最大匹配中文分词算法  回复  更多评论
  

只有注册用户登录后才能发表评论。


网站导航: