Look into it ~

present
随笔 - 32, 文章 - 0, 评论 - 3, 引用 - 0
数据加载中……

汉字转拼音

在网上参考了一些汉字转换到拼音的资料。思路应该只有以下两种。
1,查表法。这样做需要一个庞大的映射表,在j2me环境下不大合适。不过效果好,有些还支持多音字。
2,使用GB字库的映射关系。因为GB2312及其扩展GBK的汉字编码,其区位与拼音之间存在映射关系。
实际上网络上的大部分文章都是根据第二种方法来实现的。

我也是采用这种方法,因为它基本上可以利用GB2312字库,直接映射成拼音。
如果遇到不支持GB2312的手机,请参考上一篇文章。
http://blog.csdn.net/hunhun1981/archive/2007/10/26/1845576.aspx

首先来介绍下原始的代码,由于这个代码在网上存在多个版本,并不知道原始作者是谁。
再次鄙视那些剽窃人家东西不留名的人。
http://hibernate.blogdriver.com/hibernate/1036902.html
或者各位可以在百度中搜索“java 中文 拼音”,即可找到很多帖子,基本全是这份代码。

以下是一个重要片段。
 /**
     * 获得单个汉字的Ascii.
     * 
@param cn char
     * 汉字字符
     * 
@return int
     * 错误返回 0,否则返回ascii
     
*/
    
public static int getCnAscii(char cn) {
        
byte[] bytes = (String.valueOf(cn)).getBytes();
        
if (bytes == null || bytes.length > 2 || bytes.length <= 0) { //错误
            return 0;
        }
        
if (bytes.length == 1) { //英文字符
            return bytes[0];
        }
        
if (bytes.length == 2) { //中文字符
            int hightByte = 256 + bytes[0];
            
int lowByte = 256 + bytes[1];
            
int ascii = (256 * hightByte + lowByte) - 256 * 256;
            
//System.out.println("ASCII=" + ascii);
            return ascii;
        }
        
return 0//错误
    }


这个方法中有一个片段大家需要注意,“(String.valueOf(cn)).getBytes();”这一句。
这里获得的应该是GBK编码,可不是unicode。
unicode是无法使用现在的方法来获取拼音的,只能用查表法,弄一张对于j2me来说很庞大的映射表。

下面详细介绍一下我改进后的方法。并且对原始代码的作者表示感谢,虽然不知道他是谁。


/**
 * Converts GB2312 level-1 Chinese characters to toneless pinyin.
 *
 * <p>Each character is encoded as GB2312 and its two bytes are folded into a
 * negative integer code. {@link #CODE_TABLE} lists, in descending order, the
 * first code of every pinyin group; {@link #PINYIN_TABLE} holds the matching
 * syllable at the same index. A character belongs to the first group whose
 * start code is less than or equal to the character's code.
 */
public class HGB2PINYIN {

    /** Pinyin syllables, index-aligned with {@link #CODE_TABLE}. */
    private static final String[] PINYIN_TABLE = {
            "zuo", "zun", "zui", "zuan", "zu", "zou",
            "zong", "zi", "zhuo", "zhun", "zhui", "zhuang", "zhuan",
            "zhuai", "zhua", "zhu", "zhou", "zhong", "zhi", "zheng",
            "zhen", "zhe", "zhao", "zhang", "zhan", "zhai", "zha", "zeng",
            "zen", "zei", "ze", "zao", "zang", "zan", "zai", "za", "yun",
            "yue", "yuan", "yu", "you", "yong", "yo", "ying", "yin", "yi",
            "ye", "yao", "yang", "yan", "ya", "xun", "xue", "xuan", "xu",
            "xiu", "xiong", "xing", "xin", "xie", "xiao", "xiang", "xian",
            "xia", "xi", "wu", "wo", "weng", "wen", "wei", "wang", "wan",
            "wai", "wa", "tuo", "tun", "tui", "tuan", "tu", "tou", "tong",
            "ting", "tie", "tiao", "tian", "ti", "teng", "te", "tao",
            "tang", "tan", "tai", "ta", "suo", "sun", "sui", "suan", "su",
            "sou", "song", "si", "shuo", "shun", "shui", "shuang", "shuan",
            "shuai", "shua", "shu", "shou", "shi", "sheng", "shen", "she",
            "shao", "shang", "shan", "shai", "sha", "seng", "sen", "se",
            "sao", "sang", "san", "sai", "sa", "ruo", "run", "rui", "ruan",
            "ru", "rou", "rong", "ri", "reng", "ren", "re", "rao", "rang",
            "ran", "qun", "que", "quan", "qu", "qiu", "qiong", "qing",
            "qin", "qie", "qiao", "qiang", "qian", "qia", "qi", "pu", "po",
            "ping", "pin", "pie", "piao", "pian", "pi", "peng", "pen",
            "pei", "pao", "pang", "pan", "pai", "pa", "ou", "o", "nuo",
            "nue", "nuan", "nv", "nu", "nong", "niu", "ning", "nin", "nie",
            "niao", "niang", "nian", "ni", "neng", "nen", "nei", "ne",
            "nao", "nang", "nan", "nai", "na", "mu", "mou", "mo", "miu",
            "ming", "min", "mie", "miao", "mian", "mi", "meng", "men",
            "mei", "me", "mao", "mang", "man", "mai", "ma", "luo", "lun",
            "lue", "luan", "lv", "lu", "lou", "long", "liu", "ling", "lin",
            "lie", "liao", "liang", "lian", "lia", "li", "leng", "lei",
            "le", "lao", "lang", "lan", "lai", "la", "kuo", "kun", "kui",
            "kuang", "kuan", "kuai", "kua", "ku", "kou", "kong", "keng",
            "ken", "ke", "kao", "kang", "kan", "kai", "ka", "jun", "jue",
            "juan", "ju", "jiu", "jiong", "jing", "jin", "jie", "jiao",
            "jiang", "jian", "jia", "ji", "huo", "hun", "hui", "huang",
            "huan", "huai", "hua", "hu", "hou", "hong", "heng", "hen",
            "hei", "he", "hao", "hang", "han", "hai", "ha", "guo", "gun",
            "gui", "guang", "guan", "guai", "gua", "gu", "gou", "gong",
            "geng", "gen", "gei", "ge", "gao", "gang", "gan", "gai", "ga",
            "fu", "fou", "fo", "feng", "fen", "fei", "fang", "fan", "fa",
            "er", "en", "e", "duo", "dun", "dui", "duan", "du", "dou",
            "dong", "diu", "ding", "die", "diao", "dian", "di", "deng",
            "de", "dao", "dang", "dan", "dai", "da", "cuo", "cun", "cui",
            "cuan", "cu", "cou", "cong", "ci", "chuo", "chun", "chui",
            "chuang", "chuan", "chuai", "chu", "chou", "chong", "chi",
            "cheng", "chen", "che", "chao", "chang", "chan", "chai", "cha",
            "ceng", "ce", "cao", "cang", "can", "cai", "ca", "bu", "bo",
            "bing", "bin", "bie", "biao", "bian", "bi", "beng", "ben",
            "bei", "bao", "bang", "ban", "bai", "ba", "ao", "ang", "an",
            "ai", "a" };

    /** First code of each pinyin group, in descending order. */
    private static final int[] CODE_TABLE = {
            -10254, -10256, -10260, -10262, -10270, -10274,
            -10281, -10296, -10307, -10309, -10315, -10322, -10328, -10329,
            -10331, -10519, -10533, -10544, -10587, -10764, -10780, -10790,
            -10800, -10815, -10832, -10838, -11014, -11018, -11019, -11020,
            -11024, -11038, -11041, -11045, -11052, -11055, -11067, -11077,
            -11097, -11303, -11324, -11339, -11340, -11358, -11536, -11589,
            -11604, -11781, -11798, -11831, -11847, -11861, -11867, -12039,
            -12058, -12067, -12074, -12089, -12099, -12120, -12300, -12320,
            -12346, -12359, -12556, -12585, -12594, -12597, -12607, -12802,
            -12812, -12829, -12831, -12838, -12849, -12852, -12858, -12860,
            -12871, -12875, -12888, -13060, -13063, -13068, -13076, -13091,
            -13095, -13096, -13107, -13120, -13138, -13147, -13318, -13326,
            -13329, -13340, -13343, -13356, -13359, -13367, -13383, -13387,
            -13391, -13395, -13398, -13400, -13404, -13406, -13601, -13611,
            -13658, -13831, -13847, -13859, -13870, -13878, -13894, -13896,
            -13905, -13906, -13907, -13910, -13914, -13917, -14083, -14087,
            -14090, -14092, -14094, -14097, -14099, -14109, -14112, -14122,
            -14123, -14125, -14135, -14137, -14140, -14145, -14149, -14151,
            -14159, -14170, -14345, -14353, -14355, -14368, -14379, -14384,
            -14399, -14407, -14429, -14594, -14630, -14645, -14654, -14663,
            -14668, -14670, -14674, -14678, -14857, -14871, -14873, -14882,
            -14889, -14894, -14902, -14908, -14914, -14921, -14922, -14926,
            -14928, -14929, -14930, -14933, -14937, -14941, -15109, -15110,
            -15117, -15119, -15121, -15128, -15139, -15140, -15141, -15143,
            -15144, -15149, -15150, -15153, -15158, -15165, -15180, -15183,
            -15362, -15363, -15369, -15375, -15377, -15385, -15394, -15408,
            -15416, -15419, -15435, -15436, -15448, -15454, -15625, -15631,
            -15640, -15652, -15659, -15661, -15667, -15681, -15701, -15707,
            -15878, -15889, -15903, -15915, -15920, -15933, -15944, -15958,
            -15959, -16155, -16158, -16169, -16171, -16180, -16187, -16202,
            -16205, -16212, -16216, -16220, -16393, -16401, -16403, -16407,
            -16412, -16419, -16423, -16427, -16429, -16433, -16448, -16452,
            -16459, -16465, -16470, -16474, -16647, -16657, -16664, -16689,
            -16706, -16708, -16733, -16915, -16942, -16970, -16983, -17185,
            -17202, -17417, -17427, -17433, -17454, -17468, -17482, -17487,
            -17496, -17676, -17683, -17692, -17697, -17701, -17703, -17721,
            -17730, -17733, -17752, -17759, -17922, -17928, -17931, -17947,
            -17950, -17961, -17964, -17970, -17988, -17997, -18012, -18181,
            -18183, -18184, -18201, -18211, -18220, -18231, -18237, -18239,
            -18446, -18447, -18448, -18463, -18478, -18490, -18501, -18518,
            -18526, -18696, -18697, -18710, -18722, -18731, -18735, -18741,
            -18756, -18763, -18773, -18774, -18783, -18952, -18961, -18977,
            -18996, -19003, -19006, -19018, -19023, -19038, -19212, -19218,
            -19224, -19227, -19235, -19238, -19242, -19243, -19249, -19261,
            -19263, -19270, -19275, -19281, -19288, -19289, -19467, -19479,
            -19484, -19500, -19515, -19525, -19531, -19540, -19715, -19725,
            -19728, -19739, -19741, -19746, -19751, -19756, -19763, -19774,
            -19775, -19784, -19805, -19976, -19982, -19986, -19990, -20002,
            -20026, -20032, -20036, -20051, -20230, -20242, -20257, -20265,
            -20283, -20292, -20295, -20304, -20317, -20319 };

    /**
     * Converts every character of a string to pinyin and concatenates the results.
     *
     * @param gb2312 the text to convert; may be {@code null}
     * @return the converted text, or the argument itself when it is
     *         {@code null} or contains only whitespace
     */
    public String getPinyin(String gb2312) {
        if (null == gb2312 || "".equals(gb2312.trim())) {
            return gb2312;
        }
        StringBuffer result = new StringBuffer();
        for (int i = 0, len = gb2312.length(); i < len; i++) {
            result.append(getPinyin(gb2312.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Converts a single character to pinyin.
     *
     * @param gb2312 the character to convert
     * @return the pinyin syllable for a GB2312 level-1 hanzi, the character
     *         itself for single-byte characters or when its code cannot be
     *         obtained, and {@code "?"} for any other character
     */
    public String getPinyin(char gb2312) {
        int ascii = getCnAscii(gb2312);
        if (ascii == 0) { // the code could not be obtained
            return String.valueOf(gb2312);
        }
        String spell = getSpellByAscii(ascii);
        return spell == null ? String.valueOf(gb2312) : spell;
    }

    /**
     * Encodes a character as GB2312 and folds the bytes into one integer.
     *
     * @return 0 when the charset is unsupported or the encoding is not one or
     *         two bytes long; the byte value for a single-byte character;
     *         otherwise {@code high * 256 + low - 65536}
     */
    private int getCnAscii(char cn) {
        byte[] bytes;
        try {
            bytes = String.valueOf(cn).getBytes("GB2312");
        } catch (java.io.UnsupportedEncodingException e) {
            // No GB2312 support on this platform: report "no code" so that
            // the caller returns the character unchanged.
            return 0;
        }
        if (bytes == null || bytes.length > 2 || bytes.length <= 0) {
            return 0;
        }
        if (bytes.length == 1) {
            return bytes[0];
        }
        // Adding 256 converts the (negative) signed bytes to unsigned values.
        int hightByte = 256 + bytes[0];
        int lowByte = 256 + bytes[1];
        return (256 * hightByte + lowByte) - 256 * 256;
    }

    /**
     * Looks up the pinyin syllable for a code produced by {@link #getCnAscii}.
     *
     * @return the one-character string for codes in (0, 160), {@code "?"} for
     *         codes outside the table range, otherwise the pinyin syllable
     */
    private String getSpellByAscii(int ascii) {
        if (ascii > 0 && ascii < 160) { // single-byte character
            return String.valueOf((char) ascii);
        }
        if (ascii < -20319 || ascii > -10247) { // not covered by the table
            return "?";
        }
        // CODE_TABLE is descending, so the first start code that is <= ascii
        // identifies the group. The comparison must be inclusive: a code equal
        // to a start code is the first character of that very group.
        for (int ind = 0; ind < CODE_TABLE.length; ind++) {
            if (ascii >= CODE_TABLE[ind]) {
                return PINYIN_TABLE[ind];
            }
        }
        // Not reachable: the range check above guarantees a match at the
        // latest on the last (smallest) table entry.
        return "?";
    }
}

首先我去除了比较多余的LinkedHashMap,同时解决了原始代码中查询时比较弱的查询方式。直接通过数组索引做判断(参考 getSpellByAscii函数,如果有兴趣,各位可以把循环判断区间改为二分法,效果会好很多,毕竟是396个元素的数组,要找到区间最坏情况下要遍历整个数组并判断396次)。

如果手机不支持GB2312,那么可以参考我以前的文章,修改getCnAscii函数,使用我提供的库将编码转换为GB2312。

感谢各位的阅读,转贴请注明出处。
欢迎有兴趣的朋友多多指点交流经验。

上一篇文章介绍了j2me环境下汉字转换为拼音的方法。
http://blog.csdn.net/hunhun1981/archive/2007/10/27/1846778.aspx

后来又完善了一下,追加了汉字转换为拼音首字母的方法。
来源是网上出现频率比较高的汉字转拼音的java代码。
但是我都做了优化,修正了一些小缺陷,使用数组替代容器,并且将编码转换的步骤省略,直接制作成映射表。而且,查找汉字编码区间的时候使用了类似二分查找的方法。所以速度快了很多很多。
当然,要文字足够多才能看出来。

在nokia5300上进行了测试,300字175毫秒。还算不错吧。

具体就不多说了。
大家可以直接使用getFirstPY()和getAllPY()方法获得字符串或者单个字符的全拼或者拼音首字母。
    public class HGB2PINYIN {
 
        
private final int[] FIRST_TABLE = { 4521745253457614631846826,
                
470104729747614476144811949062493244989650371,
                
506145062250906513875144652218522185221852698,
                
52980536895448155289 };
 
        
private final String[] ALL_VALUE = { "zuo""zun""zui""zuan""zu",
                
"zou""zong""zi""zhuo""zhun""zhui""zhuang""zhuan",
                
"zhuai""zhua""zhu""zhou""zhong""zhi""zheng",
                
"zhen""zhe""zhao""zhang""zhan""zhai""zha""zeng",
                
"zen""zei""ze""zao""zang""zan""zai""za""yun",
                
"yue""yuan""yu""you""yong""yo""ying""yin""yi",
                
"ye""yao""yang""yan""ya""xun""xue""xuan""xu",
                
"xiu""xiong""xing""xin""xie""xiao""xiang""xian",
                
"xia""xi""wu""wo""weng""wen""wei""wang""wan",
                
"wai""wa""tuo""tun""tui""tuan""tu""tou""tong",
                
"ting""tie""tiao""tian""ti""teng""te""tao",
                
"tang""tan""tai""ta""suo""sun""sui""suan""su",
                
"sou""song""si""shuo""shun""shui""shuang""shuan",
                
"shuai""shua""shu""shou""shi""sheng""shen""she",
                
"shao""shang""shan""shai""sha""seng""sen""se",
                
"sao""sang""san""sai""sa""ruo""run""rui""ruan",
                
"ru""rou""rong""ri""reng""ren""re""rao""rang",
                
"ran""qun""que""quan""qu""qiu""qiong""qing",
                
"qin""qie""qiao""qiang""qian""qia""qi""pu""po",
                
"ping""pin""pie""piao""pian""pi""peng""pen",
                
"pei""pao""pang""pan""pai""pa""ou""o""nuo",
                
"nue""nuan""nv""nu""nong""niu""ning""nin""nie",
                
"niao""niang""nian""ni""neng""nen""nei""ne",
                
"nao""nang""nan""nai""na""mu""mou""mo""miu",
                
"ming""min""mie""miao""mian""mi""meng""men",
                
"mei""me""mao""mang""man""mai""ma""luo""lun",
                
"lue""luan""lv""lu""lou""long""liu""ling""lin",
                
"lie""liao""liang""lian""lia""li""leng""lei",
                
"le""lao""lang""lan""lai""la""kuo""kun""kui",
                
"kuang""kuan""kuai""kua""ku""kou""kong""keng",
                
"ken""ke""kao""kang""kan""kai""ka""jun""jue",
                
"juan""ju""jiu""jiong""jing""jin""jie""jiao",
                
"jiang""jian""jia""ji""huo""hun""hui""huang",
                
"huan""huai""hua""hu""hou""hong""heng""hen",
                
"hei""he""hao""hang""han""hai""ha""guo""gun",
                
"gui""guang""guan""guai""gua""gu""gou""gong",
                
"geng""gen""gei""ge""gao""gang""gan""gai""ga",
                
"fu""fou""fo""feng""fen""fei""fang""fan""fa",
                
"er""en""e""duo""dun""dui""duan""du""dou",
                
"****""diu""ding""die""diao""dian""di""deng",
                
"de""dao""dang""dan""dai""da""cuo""cun""cui",
                
"cuan""cu""cou""cong""ci""chuo""chun""chui",
                
"chuang""chuan""chuai""chu""chou""chong""chi",
                
"cheng""chen""che""chao""chang""chan""chai""cha",
                
"ceng""ce""cao""cang""can""cai""ca""bu""bo",
                
"bing""bin""bie""biao""bian""bi""beng""ben",
                
"bei""bao""bang""ban""bai""ba""ao""ang""an",
                
"ai""a" };
 
        
private final int[] ALL_CODE = { -10254-10256-10260-10262,
                
-10270-10274-10281-10296-10307-10309-10315-10322,
                
-10328-10329-10331-10519-10533-10544-10587-10764,
                
-10780-10790-10800-10815-10832-10838-11014-11018,
                
-11019-11020-11024-11038-11041-11045-11052-11055,
                
-11067-11077-11097-11303-11324-11339-11340-11358,
                
-11536-11589-11604-11781-11798-11831-11847-11861,
                
-11867-12039-12058-12067-12074-12089-12099-12120,
                
-12300-12320-12346-12359-12556-12585-12594-12597,
                
-12607-12802-12812-12829-12831-12838-12849-12852,
                
-12858-12860-12871-12875-12888-13060-13063-13068,
                
-13076-13091-13095-13096-13107-13120-13138-13147,
                
-13318-13326-13329-13340-13343-13356-13359-13367,
                
-13383-13387-13391-13395-13398-13400-13404-13406,
                
-13601-13611-13658-13831-13847-13859-13870-13878,
                
-13894-13896-13905-13906-13907-13910-13914-13917,
                
-14083-14087-14090-14092-14094-14097-14099-14109,
                
-14112-14122-14123-14125-14135-14137-14140-14145,
                
-14149-14151-14159-14170-14345-14353-14355-14368,
                
-14379-14384-14399-14407-14429-14594-14630-14645,
                
-14654-14663-14668-14670-14674-14678-14857-14871,
                
-14873-14882-14889-14894-14902-14908-14914-14921,
                
-14922-14926-14928-14929-14930-14933-14937-14941,
                
-15109-15110-15117-15119-15121-15128-15139-15140,
                
-15141-15143-15144-15149-15150-15153-15158-15165,
                
-15180-15183-15362-15363-15369-15375-15377-15385,
                
-15394-15408-15416-15419-15435-15436-15448-15454,
                
-15625-15631-15640-15652-15659-15661-15667-15681,
                
-15701-15707-15878-15889-15903-15915-15920-15933,
                
-15944-15958-15959-16155-16158-16169-16171-16180,
                
-16187-16202-16205-16212-16216-16220-16393-16401,
                
-16403-16407-16412-16419-16423-16427-16429-16433,
                
-16448-16452-16459-16465-16470-16474-16647-16657,
                
-16664-16689-16706-16708-16733-16915-16942-16970,
                
-16983-17185-17202-17417-17427-17433-17454-17468,
                
-17482-17487-17496-17676-17683-17692-17697-17701,
                
-17703-17721-17730-17733-17752-17759-17922-17928,
                
-17931-17947-17950-17961-17964-17970-17988-17997,
                
-18012-18181-18183-18184-18201-18211-18220-18231,
                
-18237-18239-18446-18447-18448-18463-18478-18490,
                
-18501-18518-18526-18696-18697-18710-18722-18731,
                
-18735-18741-18756-18763-18773-18774-18783-18952,
                
-18961-18977-18996-19003-19006-19018-19023-19038,
                
-19212-19218-19224-19227-19235-19238-19242-19243,
                
-19249-19261-19263-19270-19275-19281-19288-19289,
                
-19467-19479-19484-19500-19515-19525-19531-19540,
                
-19715-19725-19728-19739-19741-19746-19751-19756,
                
-19763-19774-19775-19784-19805-19976-19982-19986,
                
-19990-20002-20026-20032-20036-20051-20230-20242,
                
-20257-20265-20283-20292-20295-20304-20317-20319 };
 
        
public String getAllPY(String gb2312) {
            
if (null == gb2312 || "".equals(gb2312.trim())) {
                
return gb2312;
            }
            
char[] chars = gb2312.toCharArray();
            StringBuffer retuBuf 
= new StringBuffer();
            
for (int i = 0, Len = chars.length; i < Len; i++) {
                retuBuf.append(getAllPY(chars[i]));
            } 
// end of for
            return retuBuf.toString();
        }
 
        
public String getAllPY(char gb2312) {
            
int ascii = getCnAscii(gb2312);
            
if (ascii == 0) { // 取ascii时出错
                return String.valueOf(gb2312);
            } 
else {
                String spell 
= getSpellByAscii(ascii);
                
if (spell == null) {
                    
return String.valueOf(gb2312);
                } 
else {
                    
return spell;
                } 
// end of if spell == null
            }
        }
 
        
public char getFirstPY(char ch) {
            
if (ch >= 0 && ch <= 0x7F) {
                
return ch;
            }
            
int gb = 0;
            
try {
                
byte[] bytes = String.valueOf(ch).getBytes("GB2312");
                
if (bytes.length < 2) {
                    gb 
= byte2Int(bytes[0]);
                }
                gb 
= (bytes[0<< 8 & 0xff00+ (bytes[1& 0xff);
            } 
catch (Exception e) {
                
return ch;
            }
            
if (gb < FIRST_TABLE[0])
                
return ch;
            
int i;
            
for (i = 0; i < 26++i) {
                
if (match(i, gb))
                    
break;
            }
            
if (i >= 26)
                
return ch;
            
else
                
return (char) (65 + i);
        }
 
        
public String getFirstPY(String src) {
            StringBuffer sb 
= new StringBuffer();
            
int len = src.length();
            
int i;
            
for (i = 0; i < len; i++) {
                sb.append(getFirstPY(src.charAt(i)));
            }
            
return sb.toString();
        }
 
        
private int getCnAscii(char cn) {
            
byte[] bytes = null;
            
try {
                bytes 
= (String.valueOf(cn)).getBytes("GB2312");
            } 
catch (Exception e) {
                e.printStackTrace();
            }
            
if (bytes == null || bytes.length > 2 || bytes.length <= 0) {
                
return 0;
            }
            
if (bytes.length == 1) {
                
return bytes[0];
            } 
else {
                
int hightByte = 256 + bytes[0];
                
int lowByte = 256 + bytes[1];
                
int ascii = (256 * hightByte + lowByte) - 256 * 256;
                
return ascii;
            }
        }
 
        
private String getSpellByAscii(int ascii) {
            
if (ascii > 0 && ascii < 160) { // 单字符
                return String.valueOf((char) ascii);
            }
            
if (ascii < -20319 || ascii > -10247) { // 不知道的字符
                return null;
            }
            
int first = 0;
            
int sLast = ALL_CODE.length - 1;
            
int last = ALL_CODE.length - 1;
            
int mid;
            
int temp;
            
while (true) {
                mid 
= (first + last) >> 1;
                
if (ascii == ALL_CODE[mid]) {
                    
return ALL_VALUE[mid];
                } 
else if (ascii > ALL_CODE[mid]) {
                    temp 
= mid - 1;
                    
if (temp >= 0) {
                        
if (ascii < ALL_CODE[temp]) {
                            
return ALL_VALUE[mid];
                        } 
else {
                            last 
= mid;
                        }
                    } 
else {
                        
return ALL_VALUE[0];
                    }
                } 
else {
                    
if (mid + 1 <= sLast) {
                        first 
= mid + 1;
                    } 
else {
                        
return ALL_VALUE[sLast];
                    }
                }
            }
        }
 
        
private boolean match(int i, int gb) {
            
if (gb < FIRST_TABLE[i]) {
                
return false;
            }
            
int j = i + 1;
            
// 字母Z使用了两个标签
            while (j < 26 && (FIRST_TABLE[j] == FIRST_TABLE[i])) {
                
++j;
            }
            
if (j == 26)
                
return gb <= FIRST_TABLE[j];
            
else
                
return gb < FIRST_TABLE[j];
        }
 
        
private int byte2Int(byte b) {
            
if (b < 0) {
                
return 256 + b;
            } 
else {
                
return b;
            }
        }
    }


posted on 2008-08-15 15:15 LukeW 阅读(408) 评论(0)  编辑  收藏 所属分类: Tips, Tricks, Hints & Code


只有注册用户登录后才能发表评论。


网站导航: