Java 原生实现 URL Decode 功能（从 Byte 字节数组转 Unicode 字符）

Java 实现 URL Decode

背景：

Elasticsearch 的自定义脚本语言 Painless 支持部分 Java API，但 6.x 版本不支持 URLDecoder（7.0 已经支持），而分析用户搜索词时需要对 URL 进行 Decode。

因而需要自己实现

/**
 * Minimal URL decoder for {@code application/x-www-form-urlencoded} text
 * whose escaped bytes are UTF-8.
 *
 * <p>The UTF-8 bytes are turned into characters with plain bit arithmetic
 * rather than {@code new String(byte[], charset)}, so the same logic can be
 * ported to environments where that constructor is not available.
 */
public class MyURLDecode {

    /** Emitted in place of any escaped byte sequence that is not well-formed UTF-8. */
    private static final char REPLACEMENT_CHAR = '\uFFFD';

    public static void main(String[] args) {
        String str = "/controller/action?&s=%e8%ad%ac%e5%a6%82%e6%9c%9d%e9%9c%b2%e5%8e%bb%e6%97%a5%e8%8b%a6%e5%a4%9a&page=1&page_size=30";

        String decodedStr = URLDecode(str);
        // Prints the same URL with the value of "s" decoded to the eight
        // Chinese characters it encodes; everything else is unchanged.
        System.out.println(decodedStr);
    }

    /**
     * Decodes a URL-encoded string.
     *
     * <ul>
     *   <li>{@code '+'} becomes a space.</li>
     *   <li>Each run of consecutive {@code %XX} escapes is collected into bytes
     *       and decoded as UTF-8 (1 to 4 byte sequences). Malformed sequences
     *       are replaced by U+FFFD.</li>
     *   <li>Every other character is copied unchanged.</li>
     * </ul>
     *
     * @param s the encoded text; must not be {@code null}
     * @return the decoded text, or {@code s} itself when it contains neither
     *         {@code '+'} nor {@code '%'}
     * @throws IllegalArgumentException if a {@code '%'} is not followed by two
     *         hexadecimal digits (a {@link NumberFormatException} when the two
     *         characters are present but are not hex digits)
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String URLDecode(String s) {
        int numChars = s.length();
        StringBuilder sb = new StringBuilder(numChars);
        boolean needToChange = false;
        byte[] bytes = null;
        int i = 0;

        while (i < numChars) {
            // Compare the full char: narrowing to byte would make characters
            // such as U+012B or U+0125 look like '+' or '%'.
            char c = s.charAt(i);
            if (c == '+') {
                sb.append(' ');
                i++;
                needToChange = true;
            } else if (c == '%') {
                if (bytes == null) {
                    // Every escape takes three chars, so this bounds the length
                    // of this run and of every later run in the string.
                    bytes = new byte[(numChars - i) / 3];
                }
                int pos = 0;
                while (i < numChars && s.charAt(i) == '%') {
                    if (i + 2 >= numChars) {
                        throw new IllegalArgumentException(
                                "Incomplete trailing escape (%) pattern at index " + i);
                    }
                    int high = hexValue(s, i + 1);
                    int low = hexValue(s, i + 2);
                    bytes[pos++] = (byte) ((high << 4) | low);
                    i += 3;
                }
                appendUtf8(sb, bytes, pos);
                needToChange = true;
            } else {
                sb.append(c);
                i++;
            }
        }

        return needToChange ? sb.toString() : s;
    }

    /** Returns the value (0-15) of the ASCII hex digit at {@code index}, or throws if it is not one. */
    private static int hexValue(String s, int index) {
        char c = s.charAt(index);
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        throw new NumberFormatException(
                "Illegal hex character '" + c + "' in escape (%) pattern at index " + index);
    }

    /** Decodes {@code bytes[0..len)} as UTF-8 and appends the resulting characters to {@code sb}. */
    private static void appendUtf8(StringBuilder sb, byte[] bytes, int len) {
        int p = 0;
        while (p < len) {
            int lead = bytes[p] & 0xFF;
            if (lead < 0x80) {
                // Single-byte (ASCII) sequence.
                sb.append((char) lead);
                p++;
                continue;
            }

            int needed;     // number of continuation bytes expected
            int codePoint;  // payload bits gathered so far
            int minimum;    // smallest code point this length may encode
            if (lead >= 0xC2 && lead <= 0xDF) {
                needed = 1;
                codePoint = lead & 0x1F;
                minimum = 0x80;
            } else if ((lead & 0xF0) == 0xE0) {
                needed = 2;
                codePoint = lead & 0x0F;
                minimum = 0x800;
            } else if (lead >= 0xF0 && lead <= 0xF4) {
                needed = 3;
                codePoint = lead & 0x07;
                minimum = 0x10000;
            } else {
                // Stray continuation byte or a lead byte UTF-8 never uses.
                sb.append(REPLACEMENT_CHAR);
                p++;
                continue;
            }

            int q = p + 1;
            int read = 0;
            while (read < needed && q < len && (bytes[q] & 0xC0) == 0x80) {
                codePoint = (codePoint << 6) | (bytes[q] & 0x3F);
                q++;
                read++;
            }

            // Reject truncated sequences, overlong forms, surrogates and
            // values beyond the Unicode range.
            boolean wellFormed = read == needed
                    && codePoint >= minimum
                    && codePoint <= 0x10FFFF
                    && (codePoint < 0xD800 || codePoint > 0xDFFF);
            if (wellFormed) {
                sb.appendCodePoint(codePoint);
            } else {
                sb.append(REPLACEMENT_CHAR);
            }
            // q > p always holds, so the loop makes progress either way.
            p = q;
        }
    }
}

Lucene Query

{
	"size": 0,
	"query": {
		"bool": {
		  // filter conditions go here
		}
	},
	"aggs": {
		"result": {
			"terms": {
				// number of buckets to return; this is a terms-aggregation option, not a script option
				"size": 2,
				"script": {
					"lang": "painless",
					"source": "def m = /^\\/controller\\/action.+?s=(.+?)&.*?$/.matcher(doc['nginx.access.url'].value);\nif (m.matches()) {\n        String s = m.group(1);\n        boolean needToChange = false;\n        int numChars = s.length();\n        StringBuilder sb = new StringBuilder();\n        int i = 0;\n\n        char c;\n        byte[] bytes = null;\n\n        String vv = \"+%\";\n        byte vNum1 = (byte)vv.charAt(0);\n        byte vNum2 = (byte)vv.charAt(1);\n        while (i < numChars) {\n            c = s.charAt(i);\n            byte cNum = (byte)c;\n            if (cNum == vNum1) {\n                sb.append(' ');\n                i++;\n                needToChange = true;\n            } else if (cNum == vNum2) {\n                if (bytes == null) {\n                    bytes = new byte[(numChars - i) / 3];\n                }\n                int pos = 0;\n\n                String hexString = \"\";\n                int countHex = 0;\n                while (((i + 2) < numChars) && ((byte)c == vNum2)) {\n                    int v = Integer.parseInt(s.substring(i + 1, i + 3), 16);\n                    hexString += s.substring(i + 1, i + 3);\n                    countHex += 1;\n                    if (3 == countHex) {\n                        int num = Integer.parseInt(hexString, 16);\n                        String bitString = Integer.toString(num, 2);\n                        String unicodeString = \"\";\n                        if ((num & 0xf0000000L) > 0) {\n                            unicodeString = bitString.substring(5, 8) + bitString.substring(10, 16) + bitString.substring(18, 24) + bitString.substring(26, bitString.length());\n                        } else if ((num & 0xe00000) > 0) {\n                           unicodeString = bitString.substring(4, 8) + bitString.substring(10, 16) + bitString.substring(18, bitString.length());\n                        } else if ((num & 0xc000) > 0) {\n                            unicodeString = bitString.substring(3, 8) + bitString.substring(10, bitString.length());\n                        } else {\n                            unicodeString = bitString.substring(1, bitString.length());\n                        }\n                        char result = (char)Integer.parseInt(unicodeString, 2);\n                        sb.append(result);\n\n                        hexString = \"\";\n                        countHex = 0;\n                    }\n                    bytes[pos++] = (byte)v;\n                    i += 3;\n                    if (i < numChars) {\n                        c = s.charAt(i);\n                    }\n                }\n\n                needToChange = true;\n            } else {\n                sb.append(c);\n                i++;\n            }\n        }\n\n        String ret = needToChange ? sb.toString() : s;\n        return ret.toUpperCase();\n} else {\n   return \"N/A\";\n}"
				}
			}
		}
	}
}

输出

{
  "took": 2600,
  "timed_out": false,
  "_shards": {
    "total": 278,
    "successful": 278,
    "skipped": 276,
    "failed": 0
  },
  "hits": {
    "total": 476944,
    "max_score": 0,
    "hits": []
  },
  "aggregations": {
    "result": {
      "doc_count_error_upper_bound": 1419,
      "sum_other_doc_count": 359784,
      "buckets": [
        {
          "key": "秋以为期",
          "doc_count": 6514
        },
        {
          "key": "原野苍茫",
          "doc_count": 4704
        }
      ]
    }
  }
}

参考

java.net.URLDecoder.decode() 方法（其实现用到了带参数的 new String(...) 构造函数，而 Elasticsearch 的 Painless 只支持无参形式的 new String()，因此需要手工把字节数组转换为 Unicode 字符）
https://blog.csdn.net/hezh1994/article/details/78899683
https://www.elastic.co/guide/en/elasticsearch/painless/6.7/painless-api-reference.html

xchenhao

发布了40 篇原创文章 · 获赞 14 · 访问量 1万+

私信关注

Java 原生实现 URL Decode 功能（从 Byte 字节数组转 Unicode 字符）

Java 实现 URL Decode

Lucene Query

参考

猜你喜欢