posts - 495,comments - 227,trackbacks - 0
http://blog.163.com/wf_shunqiziran/blog/static/176307209201258102217810/

private String getFilecharset(File sourceFile) {
        
byte[] first3Bytes = new byte[3];
        
try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
            bis.mark(
0);
            
int read = bis.read(first3Bytes, 03);
            
if (read == -1) {
                
return "GBK"// 文件编码为 ANSI
            }
            
            
if (first3Bytes[0== (byte0xFF && first3Bytes[1== (byte0xFE) {
                
return "UTF-16LE"// 文件编码为 Unicode
            }
            
            
if (first3Bytes[0== (byte0xFE && first3Bytes[1== (byte0xFF) {
                
return "UTF-16BE"// 文件编码为 Unicode big endian
            }
            
            
if (first3Bytes[0== (byte0xEF && first3Bytes[1== (byte0xBB && first3Bytes[2== (byte0xBF) {
                
return "UTF-8"// 文件编码为 UTF-8
            }
            
            bis.reset();
            
            
while ((read = bis.read()) != -1) {
                
if (read >= 0xF0) {
                    
break;
                }
                
if (0x80 <= read && read <= 0xBF) {
                    
break;
                }
                
if (0xC0 <= read && read <= 0xDF) {
                    read 
= bis.read();
                    
if (0x80 <= read && read <= 0xBF) {
                        
// (0x80 - 0xBF),也可能在GB编码内
                        continue;
                    }
                    
                    
break;
                } 
else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小
                    read = bis.read();
                    
if (0x80 <= read && read <= 0xBF) {
                        read 
= bis.read();
                        
if (0x80 <= read && read <= 0xBF) {
                            
return "UTF-8";
                        }
                        
break;
                    }
                    
break;
                }
            }
        } 
catch (Exception e) {
            e.printStackTrace();
        }
        
return "GBK";
    }





最近用 Java 读取文件的时候,经常碰到中文乱码,特地研究了一下 Java 的编码格式。
Java 编码与 txt 文件编码的对应关系如下:
java txt
unicode unicode big endian
utf-8 utf-8
utf-16 unicode
gb2312 ANSI
Java 读取 txt 文件时,如果编码格式不对就会出现乱码。通过下面的方法获取文本文件的编码格式,然后以该编码读取文件,就不会出现乱码(只做过简单测试,不保证 100% 准确)。
/**
 * Guesses the character encoding of a text file.
 *
 * <p>The first three bytes are checked for a byte-order mark (UTF-16LE, UTF-16BE or UTF-8).
 * When no BOM is found, the content is scanned from the beginning: the first well-formed
 * three-byte UTF-8 sequence yields {@code "UTF-8"}; anything else keeps the default
 * {@code "GBK"}. This is a heuristic and can misclassify files.
 *
 * @param sourceFile the file to inspect
 * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or {@code "GBK"};
 *         {@code "GBK"} is also returned for an empty file or when reading fails
 */
private static String getFilecharset(File sourceFile) {
String charset = "GBK"; // default when nothing else matches (ANSI)
byte[] first3Bytes = new byte[3];
// try-with-resources closes the stream on every path, including the early returns
// and exceptions that previously skipped bis.close().
try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
    // The read limit has to cover the bytes consumed before reset(); with mark(0)
    // the later reset() only worked because of how the internal buffer behaves.
    bis.mark(first3Bytes.length);
    int read = bis.read(first3Bytes, 0, first3Bytes.length);
    if (read == -1) {
        return charset; // empty file: treated as ANSI
    }

    // Bytes that were not read stay 0 in the array, so a short file cannot match a BOM.
    if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
        return "UTF-16LE"; // file encoding is "Unicode"
    }
    if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
        return "UTF-16BE"; // file encoding is "Unicode big endian"
    }
    if (first3Bytes[0] == (byte) 0xEF
            && first3Bytes[1] == (byte) 0xBB
            && first3Bytes[2] == (byte) 0xBF) {
        return "UTF-8"; // file encoding is UTF-8 (with BOM)
    }

    // No BOM: rewind and scan the content from the first byte.
    bis.reset();
    while ((read = bis.read()) != -1) {
        if (read >= 0xF0) {
            break;
        }
        if (0x80 <= read && read <= 0xBF) {
            // A stand-alone byte in 0x80-0xBF is also counted as GBK.
            break;
        }
        if (0xC0 <= read && read <= 0xDF) {
            read = bis.read();
            if (0x80 <= read && read <= 0xBF) {
                // Two bytes (0xC0-0xDF)(0x80-0xBF) may also occur in GB encodings,
                // so this pair decides nothing; keep scanning.
                continue;
            }
            break;
        } else if (0xE0 <= read && read <= 0xEF) {
            // This can still be wrong, but the probability is small.
            read = bis.read();
            if (0x80 <= read && read <= 0xBF) {
                read = bis.read();
                if (0x80 <= read && read <= 0xBF) {
                    charset = "UTF-8";
                }
            }
            break;
        }
    }
} catch (Exception e) {
    // Best effort: an unreadable file is reported and the default charset is returned.
    e.printStackTrace();
}
return charset;
}
posted on 2015-05-07 15:48 SIMONE 阅读(1505) 评论(0)  编辑  收藏

只有注册用户登录后才能发表评论。


网站导航: