字符编码

字符编码笔记:ASCII,Unicode和UTF-8
字符集和字符编码
Unicode字符集和多字节字符集关系

gbk转unicode

UTF8编码如何转成GBK编码呢?
UTF-8 → Unicode → GBK

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package common;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

/**
 * Demo helpers that convert a single UTF-8 encoded character to GBK by way of
 * its Unicode code point (UTF-8 -> Unicode -> GBK).
 *
 * <p>{@link #main(String[])} looks the code point up in a classpath resource
 * named {@code unicode} and uses the line at the same index of the resource
 * named {@code gbk} as the GBK byte sequence.
 */
public class T {
    public static final Logger logger = LoggerFactory.getLogger(T.class);

    /**
     * Decodes a hexadecimal string into bytes, two hex digits per byte.
     * A trailing unpaired digit is ignored.
     *
     * @param hex hexadecimal text, upper or lower case
     * @return the decoded bytes; empty when {@code hex} has fewer than two characters
     * @throws IllegalArgumentException if a consumed character is not a hex digit
     */
    public static byte[] hexStrToBytes(String hex) {
        int sz = hex.length() >> 1;
        byte[] buff = new byte[sz];
        for (int i = 0; i < sz; i++) {
            int hi = Character.digit(hex.charAt(2 * i), 16);
            int lo = Character.digit(hex.charAt(2 * i + 1), 16);
            // Character.digit returns -1 for a non-hex character; combining that
            // into a byte would silently produce garbage, so reject it instead.
            if (hi < 0 || lo < 0) {
                throw new IllegalArgumentException("Not a hexadecimal string: " + hex);
            }
            buff[i] = (byte) (hi << 4 | lo);
        }
        return buff;
    }

    /**
     * Reads every line of {@code input}, lower-cases and trims it, and appends it
     * to {@code list}. The stream is closed when reading finishes. An
     * {@link IOException} is logged and ends the read; lines added before the
     * failure stay in the list.
     *
     * @param list  destination for the normalized lines
     * @param input stream to read; decoded as UTF-8
     *              (NOTE(review): the lookup tables are presumably plain ASCII hex - confirm)
     */
    public static void streamToList(List<? super String> list, InputStream input) {
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(input, StandardCharsets.UTF_8))) {
            String line;
            // readLine() returning null is the only reliable end-of-stream signal;
            // ready() merely reports whether a read would block.
            while ((line = in.readLine()) != null) {
                list.add(line.toLowerCase(Locale.ROOT).trim());
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Renders a byte as eight binary digits, most significant bit first.
     *
     * @param b the byte to render
     * @return an 8-character string of {@code '0'} and {@code '1'}
     */
    public static String byteToString(byte b) {
        StringBuilder sb = new StringBuilder(8);
        for (int bit = 7; bit >= 0; bit--) {
            sb.append((b >> bit) & 1);
        }
        return sb.toString();
    }

    /**
     * Extracts the code point from the UTF-8 encoding of a single character.
     * The array length is taken as the sequence length; lead and continuation
     * marker bits are stripped without being validated.
     *
     * @param buff the 1 to 4 bytes encoding exactly one character
     * @return the code point as lower-case hex, zero-padded to at least four digits
     * @throws IllegalArgumentException if {@code buff} is empty or longer than four bytes
     */
    public static String utf8ToUnicode(byte[] buff) {
        int n = buff.length;
        if (n < 1 || n > 4) {
            throw new IllegalArgumentException(
                    "Expected the 1-4 UTF-8 bytes of one character, got " + n + " bytes");
        }
        // The lead byte of an n-byte sequence carries its payload in the low (8 - n) bits.
        int codePoint = buff[0] & (0xFF >> n);
        for (int i = 1; i < n; i++) {
            // Each continuation byte (10xxxxxx) contributes its low six bits.
            codePoint = (codePoint << 6) | (buff[i] & 0x3F);
        }
        return String.format("%04x", codePoint);
    }

    /**
     * Converts the character "严" from UTF-8 to GBK through the {@code unicode}
     * and {@code gbk} lookup resources and prints the round-tripped character.
     *
     * @param args unused
     * @throws UnsupportedEncodingException if the runtime has no GBK charset
     * @throws IllegalStateException if a resource is missing or has no entry for the character
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        List<String> uni = new LinkedList<>();
        List<String> gbk = new LinkedList<>();
        streamToList(uni, openResource("unicode"));
        streamToList(gbk, openResource("gbk"));

        String s = "严";
        // Ask for UTF-8 explicitly: the no-arg getBytes() uses the platform default
        // charset, which would feed non-UTF-8 bytes to utf8ToUnicode on some systems.
        String unicode = utf8ToUnicode(s.getBytes(StandardCharsets.UTF_8));
        // Both the table entries and the computed value are lower-case hex.
        int i = uni.indexOf(unicode);
        if (i < 0 || i >= gbk.size()) {
            throw new IllegalStateException("No GBK mapping found for U+" + unicode);
        }
        byte[] buff = hexStrToBytes(gbk.get(i));
        System.out.println(new String(buff, "GBK"));
    }

    /** Opens a classpath resource, failing with a clear message when it is absent. */
    private static InputStream openResource(String name) {
        InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(name);
        if (in == null) {
            throw new IllegalStateException("Resource '" + name + "' not found on the classpath");
        }
        return in;
    }
}

gbk;unicode

编码环境

1
2
3
4
5
6
7
8
/**
 * Prints the character count of a mixed ASCII/CJK string, then the encoded
 * byte counts of that string and of a pure-ASCII one. The no-argument
 * {@code getBytes()} encodes with the JVM's default charset, so the byte
 * counts depend on the environment the test runs in.
 */
@Test
public void test0() {
    String mixed = "a严";
    int charCount = mixed.length();
    int mixedByteCount = mixed.getBytes().length;
    System.out.println(charCount);
    System.out.println(mixedByteCount);

    String ascii = "a";
    byte[] asciiBytes = ascii.getBytes();
    System.out.println(asciiBytes.length);
}

上面的代码在默认字符集为UTF-8的环境下运行,得到的结果是“2,4,1”(严格来说,getBytes()使用的是JVM的默认字符集,而不是源文件本身的编码)。也就是说,String的长度只跟字符有关,“a”和“严”都是一个字符。
而在UTF-8编码下,“a”只有1个字节,“严”有3个字节。