!11 Support binary strings preserve UTF-8 and UTF-16 errors
From: @ultra_planet Reviewed-by: @openeuler-basic Signed-off-by: @openeuler-basic
This commit is contained in:
commit
7ff876841b
616
Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch
Normal file
616
Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch
Normal file
@ -0,0 +1,616 @@
|
|||||||
|
From b2384ea878f484c48419fc0ec30380d0a5ffe3ce Mon Sep 17 00:00:00 2001
|
||||||
|
From: Max Zerzouri <maxdamantus@gmail.com>
|
||||||
|
Date: Sat, 15 May 2021 08:32:27 +0000
|
||||||
|
Subject: [PATCH] Binary strings: preserve UTF-8 and UTF-16 errors
|
||||||
|
|
||||||
|
The internal string representation is changed from UTF-8 with replacement
|
||||||
|
characters to a modified form of "WTF-8" that is able to distinctly encode
|
||||||
|
UTF-8 errors and UTF-16 errors.
|
||||||
|
|
||||||
|
This handles UTF-8 errors in raw string inputs and handles UTF-8 and UTF-16
|
||||||
|
errors in JSON input. UTF-16 errors (using "\uXXXX") and UTF-8 errors (using
|
||||||
|
the original raw bytes) are maintained when emitting JSON. When emitting raw
|
||||||
|
strings, UTF-8 errors are maintained and UTF-16 errors are converted into
|
||||||
|
replacement characters.
|
||||||
|
---
|
||||||
|
scripts/gen_utf8_tables.py | 3 +-
|
||||||
|
src/jv.c | 28 ++++++------
|
||||||
|
src/jv.h | 1 +
|
||||||
|
src/jv_parse.c | 77 ++++++++++++++++++++++-----------
|
||||||
|
src/jv_print.c | 26 +++++++++++-
|
||||||
|
src/jv_unicode.c | 87 ++++++++++++++++++++++++++++++++++----
|
||||||
|
src/jv_unicode.h | 11 +++++
|
||||||
|
src/jv_utf8_tables.h | 4 +-
|
||||||
|
src/main.c | 29 ++++++++++++-
|
||||||
|
tests/jq.test | 5 +++
|
||||||
|
tests/shtest | 9 ++++
|
||||||
|
11 files changed, 228 insertions(+), 52 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py
|
||||||
|
index 6fe0a53..7706462 100644
|
||||||
|
--- a/scripts/gen_utf8_tables.py
|
||||||
|
+++ b/scripts/gen_utf8_tables.py
|
||||||
|
@@ -16,8 +16,7 @@ def print_table(type, name, t):
|
||||||
|
def utf8info(c):
|
||||||
|
if c < 0x80: return 1, mask(7)
|
||||||
|
if 0x80 <= c <= 0xBF: return 255, mask(6)
|
||||||
|
- if 0xC0 <= c <= 0xC1: return 0, 0
|
||||||
|
- if 0xC2 <= c <= 0xDF: return 2, mask(5)
|
||||||
|
+ if 0xC0 <= c <= 0xDF: return 2, mask(5)
|
||||||
|
if 0xE0 <= c <= 0xEF: return 3, mask(4)
|
||||||
|
if 0xF0 <= c <= 0xF4: return 4, mask(3)
|
||||||
|
if 0xF4 <= c <= 0xFF: return 0, 0
|
||||||
|
diff --git a/src/jv.c b/src/jv.c
|
||||||
|
index 1f1029e..e979cc6 100644
|
||||||
|
--- a/src/jv.c
|
||||||
|
+++ b/src/jv.c
|
||||||
|
@@ -452,20 +452,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
-/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
|
||||||
|
+/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */
|
||||||
|
static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
|
||||||
|
const char* end = data + length;
|
||||||
|
const char* i = data;
|
||||||
|
const char* cstart;
|
||||||
|
|
||||||
|
- uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
|
||||||
|
+ uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX
|
||||||
|
jvp_string* s = jvp_string_alloc(maxlength);
|
||||||
|
char* out = s->data;
|
||||||
|
int c = 0;
|
||||||
|
|
||||||
|
- while ((i = jvp_utf8_next((cstart = i), end, &c))) {
|
||||||
|
+ while ((i = jvp_utf8_extended_next((cstart = i), end, 0, &c))) {
|
||||||
|
if (c == -1) {
|
||||||
|
- c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
+ int error = (unsigned char)*cstart;
|
||||||
|
+ assert(error >= 0x80 && error <= 0xFF);
|
||||||
|
+ c = -error;
|
||||||
|
+ /* Ensure each UTF-8 error byte is consumed separately */
|
||||||
|
+ i = cstart + 1;
|
||||||
|
}
|
||||||
|
out += jvp_utf8_encode(c, out);
|
||||||
|
assert(out < s->data + maxlength);
|
||||||
|
@@ -477,8 +481,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
-/* Assumes valid UTF8 */
|
||||||
|
-static jv jvp_string_new(const char* data, uint32_t length) {
|
||||||
|
+/* Assumes valid WTF-8b */
|
||||||
|
+jv jv_string_extended_sized(const char* data, int length) {
|
||||||
|
jvp_string* s = jvp_string_alloc(length);
|
||||||
|
s->length_hashed = length << 1;
|
||||||
|
if (data != NULL)
|
||||||
|
@@ -618,7 +622,7 @@ static int jvp_string_equal(jv a, jv b) {
|
||||||
|
jv jv_string_sized(const char* str, int len) {
|
||||||
|
return
|
||||||
|
jvp_utf8_is_valid(str, str+len) ?
|
||||||
|
- jvp_string_new(str, len) :
|
||||||
|
+ jv_string_extended_sized(str, len) :
|
||||||
|
jvp_string_copy_replace_bad(str, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -682,14 +686,14 @@ jv jv_string_split(jv j, jv sep) {
|
||||||
|
|
||||||
|
if (seplen == 0) {
|
||||||
|
int c;
|
||||||
|
- while ((jstr = jvp_utf8_next(jstr, jend, &c)))
|
||||||
|
+ while ((jstr = jvp_utf8_extended_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c)))
|
||||||
|
a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c));
|
||||||
|
} else {
|
||||||
|
for (p = jstr; p < jend; p = s + seplen) {
|
||||||
|
s = _jq_memmem(p, jend - p, sepstr, seplen);
|
||||||
|
if (s == NULL)
|
||||||
|
s = jend;
|
||||||
|
- a = jv_array_append(a, jv_string_sized(p, s - p));
|
||||||
|
+ a = jv_array_append(a, jv_string_extended_sized(p, s - p));
|
||||||
|
// Add an empty string to denote that j ends on a sep
|
||||||
|
if (s + seplen == jend && seplen != 0)
|
||||||
|
a = jv_array_append(a, jv_string(""));
|
||||||
|
@@ -760,7 +764,7 @@ jv jv_string_slice(jv j, int start, int end) {
|
||||||
|
|
||||||
|
/* Look for byte offset corresponding to start codepoints */
|
||||||
|
for (p = s, i = 0; i < start; i++) {
|
||||||
|
- p = jvp_utf8_next(p, s + len, &c);
|
||||||
|
+ p = jvp_utf8_extended_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c);
|
||||||
|
if (p == NULL) {
|
||||||
|
jv_free(j);
|
||||||
|
return jv_string_empty(16);
|
||||||
|
@@ -772,7 +776,7 @@ jv jv_string_slice(jv j, int start, int end) {
|
||||||
|
}
|
||||||
|
/* Look for byte offset corresponding to end codepoints */
|
||||||
|
for (e = p; e != NULL && i < end; i++) {
|
||||||
|
- e = jvp_utf8_next(e, s + len, &c);
|
||||||
|
+ e = jvp_utf8_extended_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c);
|
||||||
|
if (e == NULL) {
|
||||||
|
e = s + len;
|
||||||
|
break;
|
||||||
|
@@ -790,7 +794,7 @@ jv jv_string_slice(jv j, int start, int end) {
|
||||||
|
* memory like a drunken navy programmer. There's probably nothing we
|
||||||
|
* can do about it.
|
||||||
|
*/
|
||||||
|
- res = jv_string_sized(p, e - p);
|
||||||
|
+ res = jv_string_extended_sized(p, e - p);
|
||||||
|
jv_free(j);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
diff --git a/src/jv.h b/src/jv.h
|
||||||
|
index d111c80..2aed1ae 100644
|
||||||
|
--- a/src/jv.h
|
||||||
|
+++ b/src/jv.h
|
||||||
|
@@ -104,6 +104,7 @@ jv jv_array_indexes(jv, jv);
|
||||||
|
|
||||||
|
jv jv_string(const char*);
|
||||||
|
jv jv_string_sized(const char*, int);
|
||||||
|
+jv jv_string_extended_sized(const char*, int);
|
||||||
|
jv jv_string_empty(int len);
|
||||||
|
int jv_string_length_bytes(jv);
|
||||||
|
int jv_string_length_codepoints(jv);
|
||||||
|
diff --git a/src/jv_parse.c b/src/jv_parse.c
|
||||||
|
index 51ad9f0..194efaf 100644
|
||||||
|
--- a/src/jv_parse.c
|
||||||
|
+++ b/src/jv_parse.c
|
||||||
|
@@ -397,7 +397,7 @@ static void tokenadd(struct jv_parser* p, char c) {
|
||||||
|
p->tokenbuf[p->tokenpos++] = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
-static int unhex4(char* hex) {
|
||||||
|
+static int unhex4(const char* hex) {
|
||||||
|
int r = 0;
|
||||||
|
for (int i=0; i<4; i++) {
|
||||||
|
char c = *hex++;
|
||||||
|
@@ -413,15 +413,19 @@ static int unhex4(char* hex) {
|
||||||
|
}
|
||||||
|
|
||||||
|
static pfunc found_string(struct jv_parser* p) {
|
||||||
|
- char* in = p->tokenbuf;
|
||||||
|
- char* out = p->tokenbuf;
|
||||||
|
- char* end = p->tokenbuf + p->tokenpos;
|
||||||
|
-
|
||||||
|
- while (in < end) {
|
||||||
|
- char c = *in++;
|
||||||
|
+ const char* in = p->tokenbuf;
|
||||||
|
+ // start by writing to tokenbuf, only allocate in case that output size is greater than input size (possible only when input has UTF-8 errors)
|
||||||
|
+ char* newbuf = NULL;
|
||||||
|
+ char* buf = p->tokenbuf;
|
||||||
|
+ char* out = buf;
|
||||||
|
+ const char* end = p->tokenbuf + p->tokenpos;
|
||||||
|
+ const char* cstart;
|
||||||
|
+ int c;
|
||||||
|
+
|
||||||
|
+ while ((in = jvp_utf8_extended_next((cstart = in), end, 0, &c))) {
|
||||||
|
if (c == '\\') {
|
||||||
|
if (in >= end)
|
||||||
|
- return "Expected escape character at end of string";
|
||||||
|
+ return jv_mem_free(newbuf), "Expected escape character at end of string";
|
||||||
|
c = *in++;
|
||||||
|
switch (c) {
|
||||||
|
case '\\':
|
||||||
|
@@ -436,38 +440,61 @@ static pfunc found_string(struct jv_parser* p) {
|
||||||
|
case 'u':
|
||||||
|
/* ahh, the complicated case */
|
||||||
|
if (in + 4 > end)
|
||||||
|
- return "Invalid \\uXXXX escape";
|
||||||
|
+ return jv_mem_free(newbuf), "Invalid \\uXXXX escape";
|
||||||
|
int hexvalue = unhex4(in);
|
||||||
|
if (hexvalue < 0)
|
||||||
|
- return "Invalid characters in \\uXXXX escape";
|
||||||
|
+ return jv_mem_free(newbuf), "Invalid characters in \\uXXXX escape";
|
||||||
|
unsigned long codepoint = (unsigned long)hexvalue;
|
||||||
|
in += 4;
|
||||||
|
+ // leading surrogate
|
||||||
|
if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
|
||||||
|
- /* who thought UTF-16 surrogate pairs were a good idea? */
|
||||||
|
- if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
|
||||||
|
- return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
|
||||||
|
- unsigned long surrogate = unhex4(in+2);
|
||||||
|
- if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
|
||||||
|
- return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
|
||||||
|
- in += 6;
|
||||||
|
- codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
|
||||||
|
- |(surrogate - 0xDC00));
|
||||||
|
+ // look ahead for trailing surrogate and decode as UTF-16, otherwise encode this lone surrogate as WTF-8
|
||||||
|
+ if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') {
|
||||||
|
+ unsigned long surrogate = unhex4(in+2);
|
||||||
|
+ if (0xDC00 <= surrogate && surrogate <= 0xDFFF) {
|
||||||
|
+ in += 6;
|
||||||
|
+ codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
|
||||||
|
+ |(surrogate - 0xDC00));
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
- if (codepoint > 0x10FFFF)
|
||||||
|
- codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
+ // UTF-16 surrogates can not encode a greater codepoint
|
||||||
|
+ assert(codepoint <= 0x10FFFF);
|
||||||
|
+ // NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8
|
||||||
|
out += jvp_utf8_encode(codepoint, out);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
- return "Invalid escape";
|
||||||
|
+ return jv_mem_free(newbuf), "Invalid escape";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (c > 0 && c < 0x001f)
|
||||||
|
- return "Invalid string: control characters from U+0000 through U+001F must be escaped";
|
||||||
|
- *out++ = c;
|
||||||
|
+ return jv_mem_free(newbuf), "Invalid string: control characters from U+0000 through U+001F must be escaped";
|
||||||
|
+ if (c == -1) {
|
||||||
|
+ int error = (unsigned char)*cstart;
|
||||||
|
+ assert(error >= 0x80 && error <= 0xFF);
|
||||||
|
+ c = -error;
|
||||||
|
+ /* Ensure each UTF-8 error byte is consumed separately */
|
||||||
|
+ const int wtf8_length = 2;
|
||||||
|
+ assert(jvp_utf8_encode_length(c) == wtf8_length);
|
||||||
|
+ in = cstart + 1;
|
||||||
|
+ if (newbuf == NULL && out + wtf8_length > in) {
|
||||||
|
+ /* Output is about to overflow input, move output to temporary buffer */
|
||||||
|
+ int current_size = out - p->tokenbuf;
|
||||||
|
+ int remaining = end - cstart;
|
||||||
|
+ newbuf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX
|
||||||
|
+ memcpy(newbuf, buf, current_size);
|
||||||
|
+ buf = newbuf;
|
||||||
|
+ out = buf + current_size;
|
||||||
|
+ }
|
||||||
|
+ } else
|
||||||
|
+ assert(jvp_utf8_encode_length(c) == in - cstart);
|
||||||
|
+ out += jvp_utf8_encode(c, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
- TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
|
||||||
|
+ jv v = jv_string_extended_sized(buf, out - buf);
|
||||||
|
+ jv_mem_free(newbuf);
|
||||||
|
+ TRY(value(p, v));
|
||||||
|
p->tokenpos = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
diff --git a/src/jv_print.c b/src/jv_print.c
|
||||||
|
index 5ebc01e..dfa1f05 100644
|
||||||
|
--- a/src/jv_print.c
|
||||||
|
+++ b/src/jv_print.c
|
||||||
|
@@ -98,6 +98,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) {
|
||||||
|
put_buf(&c, 1, fout, strout, T);
|
||||||
|
}
|
||||||
|
|
||||||
|
+static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) {
|
||||||
|
+ assert(c >= 0x80 && c <= 0xFF);
|
||||||
|
+ if (strout) {
|
||||||
|
+ // encode as an invalid UTF-8 byte in output
|
||||||
|
+ *strout = jv_string_append_codepoint(*strout, -c);
|
||||||
|
+ } else {
|
||||||
|
+ put_char(c, fout, strout, T);
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
static void put_str(const char* s, FILE* fout, jv* strout, int T) {
|
||||||
|
put_buf(s, strlen(s), fout, strout, T);
|
||||||
|
}
|
||||||
|
@@ -121,7 +131,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
|
||||||
|
int c = 0;
|
||||||
|
char buf[32];
|
||||||
|
put_char('"', F, S, T);
|
||||||
|
- while ((i = jvp_utf8_next((cstart = i), end, &c))) {
|
||||||
|
+ while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
|
||||||
|
assert(c != -1);
|
||||||
|
int unicode_escape = 0;
|
||||||
|
if (0x20 <= c && c <= 0x7E) {
|
||||||
|
@@ -130,6 +140,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
|
||||||
|
put_char('\\', F, S, T);
|
||||||
|
}
|
||||||
|
put_char(c, F, S, T);
|
||||||
|
+ } else if (c >= -0xFF && c <= -0x80) {
|
||||||
|
+ // Invalid UTF-8 byte
|
||||||
|
+ if (ascii_only) {
|
||||||
|
+ // refusing to emit invalid UTF-8
|
||||||
|
+ // TODO: convince the world to adopt a "\xXX" notation for JSON?
|
||||||
|
+ c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
+ unicode_escape = 1;
|
||||||
|
+ } else {
|
||||||
|
+ // pass through
|
||||||
|
+ put_invalid_utf8_byte(-c, F, S, T);
|
||||||
|
+ }
|
||||||
|
} else if (c < 0x20 || c == 0x7F) {
|
||||||
|
// ASCII control character
|
||||||
|
switch (c) {
|
||||||
|
@@ -160,6 +181,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
|
||||||
|
} else {
|
||||||
|
if (ascii_only) {
|
||||||
|
unicode_escape = 1;
|
||||||
|
+ } else if (c >= 0xD800 && c <= 0xDFFF) {
|
||||||
|
+ // lone surrogate; can't be encoded to UTF-8
|
||||||
|
+ unicode_escape = 1;
|
||||||
|
} else {
|
||||||
|
put_buf(cstart, i - cstart, F, S, T);
|
||||||
|
}
|
||||||
|
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
|
||||||
|
index d197349..8c47536 100644
|
||||||
|
--- a/src/jv_unicode.c
|
||||||
|
+++ b/src/jv_unicode.c
|
||||||
|
@@ -27,6 +27,56 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
|
||||||
|
+ return jvp_utf8_extended_next(in, end, JVP_UTF8_REPLACE, codepoint_ret);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/*
|
||||||
|
+ The internal representation of jv strings uses an encoding that is hereby
|
||||||
|
+ referred to as "WTF-8b" (until someone demonstrates use of another term to
|
||||||
|
+ refer to the same encoding).
|
||||||
|
+
|
||||||
|
+ WTF-8b is an extension of WTF-8, which is an extension of UTF-8. Any sequence
|
||||||
|
+ of Unicode scalar values is represented by the same bytes in UTF-8, WTF-8 and
|
||||||
|
+ WTF-8b, therefore any well-formed UTF-8 string is interpreted as the same
|
||||||
|
+ sequence of Unicode scalar values (roughly, code points) in WTF-8b.
|
||||||
|
+
|
||||||
|
+ Like WTF-8, WTF-8b is able to encode UTF-16 errors (lone surrogates) using
|
||||||
|
+ the "generalized UTF-8" representation of code points between U+D800 and
|
||||||
|
+ U+DFFF. These errors occur in JSON terms such as:
|
||||||
|
+ "_\uD8AB_\uDBCD_"
|
||||||
|
+
|
||||||
|
+ Unlike WTF-8, WTF-8b is also able to encode UTF-8 errors (bytes 0x80 to 0xFF
|
||||||
|
+ that are not part of a valid UTF-8 sequence) using the first 128 "overlong"
|
||||||
|
+ codings (unused 2-byte representations of U+00 to U+7F). These errors can
|
||||||
|
+ occur in any byte stream that is interpreted as UTF-8, for example:
|
||||||
|
+ "\xED\xA2\xAB"
|
||||||
|
+ The above example is in fact the WTF-8b (and WTF-8) encoding for the lone
|
||||||
|
+ UTF-16 surrogate "\uD8AB", which demonstrates the need for a distinct
|
||||||
|
+ encoding of UTF-8 errors. If a distinction were not made, then "\xED\xA2\xAB"
|
||||||
|
+ and "\uD8AB" would be interpreted as the same string, so at least one of the
|
||||||
|
+ forms would not be preserved when printed as JSON output.
|
||||||
|
+
|
||||||
|
+ It should also be noted that the process of converting from invalid UTF-8 to
|
||||||
|
+ WTF-8b is not (and can not be) idempotent, since the "generalized UTF-8"
|
||||||
|
+ representations of UTF-16 surrogates are intentionally not able to be
|
||||||
|
+ generated from invalid UTF-8, only through some other means (usually "\uXXXX"
|
||||||
|
+ notation).
|
||||||
|
+
|
||||||
|
+ Each UTF-16 error is encoded as 3 WTF-8b (or WTF-8) bytes.
|
||||||
|
+ Each UTF-8 error is encoded as 2 WTF-8b bytes.
|
||||||
|
+
|
||||||
|
+ When iterating over code points using `JVP_UTF8_ERRORS_UTF16`, encoded UTF-16
|
||||||
|
+ errors are emitted in the form of code points in the range U+D800 to U+DFFF.
|
||||||
|
+ These code points can be reencoded as usual using `jvp_utf8_encode`.
|
||||||
|
+
|
||||||
|
+ When iterating over code points using `JVP_UTF8_ERRORS_UTF8`, encoded UTF-8
|
||||||
|
+ errors are emitted in the form of code points in the negative range -0x80 to
|
||||||
|
+ -0xFF. These negative code points can be negated to determine the original
|
||||||
|
+ error bytes. These code points can be reencoded as usual using
|
||||||
|
+ `jvp_utf8_encode`.
|
||||||
|
+*/
|
||||||
|
+
|
||||||
|
+const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint_ret) {
|
||||||
|
assert(in <= end);
|
||||||
|
if (in == end) {
|
||||||
|
return 0;
|
||||||
|
@@ -40,9 +90,11 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
|
||||||
|
length = 1;
|
||||||
|
} else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
|
||||||
|
/* Bad single byte - either an invalid byte or an out-of-place continuation byte */
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
|
||||||
|
length = 1;
|
||||||
|
} else if (in + length > end) {
|
||||||
|
/* String ends before UTF8 sequence ends */
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
|
||||||
|
length = end - in;
|
||||||
|
} else {
|
||||||
|
codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
|
||||||
|
@@ -50,6 +102,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
|
||||||
|
unsigned ch = (unsigned char)in[i];
|
||||||
|
if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
|
||||||
|
/* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
|
||||||
|
codepoint = -1;
|
||||||
|
length = i;
|
||||||
|
break;
|
||||||
|
@@ -58,17 +111,29 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
|
||||||
|
}
|
||||||
|
if (codepoint < utf8_first_codepoint[length]) {
|
||||||
|
/* Overlong UTF8 sequence */
|
||||||
|
- codepoint = -1;
|
||||||
|
+ if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
|
||||||
|
+ /* UTF-8 error is emitted as a negative codepoint */
|
||||||
|
+ codepoint = -(codepoint + 0x80);
|
||||||
|
+ } else {
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
|
||||||
|
+ codepoint = -1;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
|
||||||
|
- /* Surrogate codepoints can't be encoded in UTF8 */
|
||||||
|
- codepoint = -1;
|
||||||
|
+ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
|
||||||
|
+ if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
|
||||||
|
+ /* Surrogate codepoints can't be encoded in UTF8 */
|
||||||
|
+ codepoint = -1;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
if (codepoint > 0x10FFFF) {
|
||||||
|
/* Outside Unicode range */
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
|
||||||
|
codepoint = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
+ if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
|
||||||
|
+ codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
assert(length > 0);
|
||||||
|
*codepoint_ret = codepoint;
|
||||||
|
return in + length;
|
||||||
|
@@ -76,7 +141,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
|
||||||
|
|
||||||
|
int jvp_utf8_is_valid(const char* in, const char* end) {
|
||||||
|
int codepoint;
|
||||||
|
- while ((in = jvp_utf8_next(in, end, &codepoint))) {
|
||||||
|
+ while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) {
|
||||||
|
if (codepoint == -1) return 0;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
@@ -91,20 +156,24 @@ int jvp_utf8_decode_length(char startchar) {
|
||||||
|
}
|
||||||
|
|
||||||
|
int jvp_utf8_encode_length(int codepoint) {
|
||||||
|
- if (codepoint <= 0x7F) return 1;
|
||||||
|
+ if (codepoint >= 0 && codepoint <= 0x7F) return 1;
|
||||||
|
else if (codepoint <= 0x7FF) return 2;
|
||||||
|
else if (codepoint <= 0xFFFF) return 3;
|
||||||
|
else return 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
int jvp_utf8_encode(int codepoint, char* out) {
|
||||||
|
- assert(codepoint >= 0 && codepoint <= 0x10FFFF);
|
||||||
|
+ assert((codepoint >= 0 && codepoint <= 0x10FFFF) || (codepoint >= -0xFF && codepoint <= -0x80));
|
||||||
|
char* start = out;
|
||||||
|
- if (codepoint <= 0x7F) {
|
||||||
|
+ if (codepoint >= 0 && codepoint <= 0x7F) {
|
||||||
|
*out++ = codepoint;
|
||||||
|
} else if (codepoint <= 0x7FF) {
|
||||||
|
- *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6);
|
||||||
|
- *out++ = 0x80 + ((codepoint & 0x03F));
|
||||||
|
+ // encode UTF-8 errors as overlong representations of U+00 to U+7F
|
||||||
|
+ int cp = codepoint >= -0xFF && codepoint <= -0x80?
|
||||||
|
+ -codepoint - 0x80 :
|
||||||
|
+ codepoint;
|
||||||
|
+ *out++ = 0xC0 + ((cp & 0x7C0) >> 6);
|
||||||
|
+ *out++ = 0x80 + ((cp & 0x03F));
|
||||||
|
} else if(codepoint <= 0xFFFF) {
|
||||||
|
*out++ = 0xE0 + ((codepoint & 0xF000) >> 12);
|
||||||
|
*out++ = 0x80 + ((codepoint & 0x0FC0) >> 6);
|
||||||
|
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
|
||||||
|
index 558721a..37c7fc0 100644
|
||||||
|
--- a/src/jv_unicode.h
|
||||||
|
+++ b/src/jv_unicode.h
|
||||||
|
@@ -1,7 +1,18 @@
|
||||||
|
#ifndef JV_UNICODE_H
|
||||||
|
#define JV_UNICODE_H
|
||||||
|
|
||||||
|
+enum jvp_utf8_flags {
|
||||||
|
+ /* Emit replacement character instead of -1 for errors */
|
||||||
|
+ JVP_UTF8_REPLACE = 1,
|
||||||
|
+ /* Treat input as WTF-8b, emit 0xD800 to 0xDFFF to denote encoded UTF-16 errors */
|
||||||
|
+ JVP_UTF8_ERRORS_UTF16 = 2,
|
||||||
|
+ /* Treat input as WTF-8b, emit -0x80 to -0xFF to denote encoded UTF-8 errors */
|
||||||
|
+ JVP_UTF8_ERRORS_UTF8 = 4,
|
||||||
|
+ JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
|
||||||
|
+const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
|
||||||
|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
|
||||||
|
int jvp_utf8_is_valid(const char* in, const char* end);
|
||||||
|
|
||||||
|
diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h
|
||||||
|
index f1a4252..7c68749 100644
|
||||||
|
--- a/src/jv_utf8_tables.h
|
||||||
|
+++ b/src/jv_utf8_tables.h
|
||||||
|
@@ -12,7 +12,7 @@ static const unsigned char utf8_coding_length[] =
|
||||||
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
|
- 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||||
|
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||||
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||||
|
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
|
||||||
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
|
||||||
|
@@ -29,7 +29,7 @@ static const unsigned char utf8_coding_bits[] =
|
||||||
|
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
|
||||||
|
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
|
||||||
|
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
|
||||||
|
- 0x00, 0x00, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
|
||||||
|
+ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
|
||||||
|
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
|
||||||
|
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
|
||||||
|
0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
|
||||||
|
diff --git a/src/main.c b/src/main.c
|
||||||
|
index b154689..5fa5c4f 100644
|
||||||
|
--- a/src/main.c
|
||||||
|
+++ b/src/main.c
|
||||||
|
@@ -30,6 +30,7 @@
|
||||||
|
#include "jv.h"
|
||||||
|
#include "jq.h"
|
||||||
|
#include "jv_alloc.h"
|
||||||
|
+#include "jv_unicode.h"
|
||||||
|
#include "util.h"
|
||||||
|
#include "src/version.h"
|
||||||
|
|
||||||
|
@@ -161,6 +162,30 @@ static const char *skip_shebang(const char *p) {
|
||||||
|
return n+1;
|
||||||
|
}
|
||||||
|
|
||||||
|
+static void jvp_dump_raw_string(const char* start, const char* end, FILE* f) {
|
||||||
|
+ static const unsigned char UTF8_REPLACEMENT[] = {0xEF,0xBF,0xBD}; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
+
|
||||||
|
+ const char* i = start;
|
||||||
|
+ const char* cstart;
|
||||||
|
+ int c;
|
||||||
|
+
|
||||||
|
+ while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
|
||||||
|
+ if (c >= -0xFF && c <= -0x80) {
|
||||||
|
+ // invalid UTF-8 byte; pass through
|
||||||
|
+ fwrite(start, 1, cstart - start, f);
|
||||||
|
+ start = i;
|
||||||
|
+ fputc(-c, f);
|
||||||
|
+ } else if ((c >= 0xD800 && c <= 0xDFFF) || c == -1) {
|
||||||
|
+ // lone surrogate; can't be encoded to UTF-8
|
||||||
|
+ fwrite(start, 1, cstart - start, f);
|
||||||
|
+ start = i;
|
||||||
|
+ fwrite(UTF8_REPLACEMENT, 1, sizeof(UTF8_REPLACEMENT), f);
|
||||||
|
+ } else
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
+ fwrite(start, 1, end - start, f);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
static int process(jq_state *jq, jv value, int flags, int dumpopts) {
|
||||||
|
int ret = 14; // No valid results && -e -> exit(4)
|
||||||
|
jq_start(jq, value, flags);
|
||||||
|
@@ -170,7 +195,9 @@ static int process(jq_state *jq, jv value, int flags, int dumpopts) {
|
||||||
|
if (options & ASCII_OUTPUT) {
|
||||||
|
jv_dumpf(result, stdout, JV_PRINT_ASCII);
|
||||||
|
} else {
|
||||||
|
- fwrite(jv_string_value(result), 1, jv_string_length_bytes(jv_copy(result)), stdout);
|
||||||
|
+ const char *start = jv_string_value(result);
|
||||||
|
+ const char *end = start + jv_string_length_bytes(jv_copy(result));
|
||||||
|
+ jvp_dump_raw_string(start, end, stdout);
|
||||||
|
}
|
||||||
|
ret = 0;
|
||||||
|
jv_free(result);
|
||||||
|
diff --git a/tests/jq.test b/tests/jq.test
|
||||||
|
index 7e2dd43..c882fd2 100644
|
||||||
|
--- a/tests/jq.test
|
||||||
|
+++ b/tests/jq.test
|
||||||
|
@@ -57,6 +57,11 @@ null
|
||||||
|
"Aa\r\n\t\b\f\u03bc"
|
||||||
|
"Aa\u000d\u000a\u0009\u0008\u000c\u03bc"
|
||||||
|
|
||||||
|
+# Check that unpaired surrogates are preserved in output
|
||||||
|
+"\u2200\ud800\u2203\udc00\u2205\udfff"
|
||||||
|
+null
|
||||||
|
+"∀\ud800∃\udc00∅\udfff"
|
||||||
|
+
|
||||||
|
"inter\("pol" + "ation")"
|
||||||
|
null
|
||||||
|
"interpolation"
|
||||||
|
diff --git a/tests/shtest b/tests/shtest
|
||||||
|
index 86fec33..4c8b57e 100755
|
||||||
|
--- a/tests/shtest
|
||||||
|
+++ b/tests/shtest
|
||||||
|
@@ -130,6 +130,15 @@ printf "[1,2][3,4]\n" | $JQ -cs add > $d/out 2>&1
|
||||||
|
cmp $d/out $d/expected
|
||||||
|
|
||||||
|
|
||||||
|
+clean=false
|
||||||
|
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
|
||||||
|
+dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
|
||||||
|
+$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
|
||||||
|
+$VALGRIND $Q $JQ -j . $d/out.json >$d/out
|
||||||
|
+cmp $d/out $d/rand
|
||||||
|
+clean=true
|
||||||
|
+
|
||||||
|
+
|
||||||
|
## Test streaming parser
|
||||||
|
|
||||||
|
## If we add an option to stream to the `import ... as $symbol;` directive
|
||||||
388
Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch
Normal file
388
Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch
Normal file
@ -0,0 +1,388 @@
|
|||||||
|
From 8829368f14943b8d2674c75805b27e56a569ad2c Mon Sep 17 00:00:00 2001
|
||||||
|
From: Max Zerzouri <maxdamantus@gmail.com>
|
||||||
|
Date: Tue, 25 May 2021 22:59:59 +1200
|
||||||
|
Subject: [PATCH] Correct UTF-8 and UTF-16 errors during concatenation
|
||||||
|
|
||||||
|
UTF-8 errors and UTF-16 errors that were previously encoded into the
|
||||||
|
ends of
|
||||||
|
strings will now potentially be used to form correct code points.
|
||||||
|
|
||||||
|
This is mostly a matter of making string equality behave as expected, since
|
||||||
|
without this normalisation, it is possible to produce `jv` strings that are
|
||||||
|
converted to UTF-8 or UTF-16 the same way but are not equal due to well-formed
|
||||||
|
code units that may or may not be encoded as errors.
|
||||||
|
---
|
||||||
|
src/jv.c | 13 ++-
|
||||||
|
src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++---------
|
||||||
|
src/jv_unicode.h | 3 +
|
||||||
|
tests/jq.test | 15 +++
|
||||||
|
4 files changed, 230 insertions(+), 49 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/jv.c b/src/jv.c
|
||||||
|
index e979cc6..67d86fb 100644
|
||||||
|
--- a/src/jv.c
|
||||||
|
+++ b/src/jv.c
|
||||||
|
@@ -522,20 +522,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
|
||||||
|
jvp_string* s = jvp_string_ptr(string);
|
||||||
|
uint32_t currlen = jvp_string_length(s);
|
||||||
|
|
||||||
|
+ char join_buf[4];
|
||||||
|
+ int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf);
|
||||||
|
+
|
||||||
|
if (jvp_refcnt_unshared(string.u.ptr) &&
|
||||||
|
- jvp_string_remaining_space(s) >= len) {
|
||||||
|
+ jvp_string_remaining_space(s) >= join_len + len) {
|
||||||
|
// the next string fits at the end of a
|
||||||
|
+ memcpy(s->data + currlen, join_buf, join_len);
|
||||||
|
+ currlen += join_len;
|
||||||
|
memcpy(s->data + currlen, data, len);
|
||||||
|
s->data[currlen + len] = 0;
|
||||||
|
s->length_hashed = (currlen + len) << 1;
|
||||||
|
return string;
|
||||||
|
} else {
|
||||||
|
// allocate a bigger buffer and copy
|
||||||
|
- uint32_t allocsz = (currlen + len) * 2;
|
||||||
|
+ uint32_t allocsz = (currlen + join_len + len) * 2;
|
||||||
|
if (allocsz < 32) allocsz = 32;
|
||||||
|
jvp_string* news = jvp_string_alloc(allocsz);
|
||||||
|
- news->length_hashed = (currlen + len) << 1;
|
||||||
|
+ news->length_hashed = (currlen + join_len + len) << 1;
|
||||||
|
memcpy(news->data, s->data, currlen);
|
||||||
|
+ memcpy(news->data + currlen, join_buf, join_len);
|
||||||
|
+ currlen += join_len;
|
||||||
|
memcpy(news->data + currlen, data, len);
|
||||||
|
news->data[currlen + len] = 0;
|
||||||
|
jvp_string_free(string);
|
||||||
|
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
|
||||||
|
index 8c47536..7d67300 100644
|
||||||
|
--- a/src/jv_unicode.c
|
||||||
|
+++ b/src/jv_unicode.c
|
||||||
|
@@ -1,8 +1,72 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
+#include <string.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include "jv_unicode.h"
|
||||||
|
#include "jv_utf8_tables.h"
|
||||||
|
|
||||||
|
+// length of encoding of erroneous UTF-8 byte
|
||||||
|
+#define UTF8_ERR_LEN 2
|
||||||
|
+// length of encoding of erroneous UTF-16 surrogate
|
||||||
|
+#define UTF16_ERR_LEN 3
|
||||||
|
+
|
||||||
|
+#define U32(a, b, c, d) ( \
|
||||||
|
+ (uint32_t) (a) << 0 | \
|
||||||
|
+ (uint32_t) (b) << 8 | \
|
||||||
|
+ (uint32_t) (c) << 16 | \
|
||||||
|
+ (uint32_t) (d) << 24 \
|
||||||
|
+)
|
||||||
|
+
|
||||||
|
+#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF))
|
||||||
|
+
|
||||||
|
+#define B0 0x00 // 00000000
|
||||||
|
+#define B1 0x80 // 10000000
|
||||||
|
+#define B2 0xC0 // 11000000
|
||||||
|
+#define B3 0xE0 // 11100000
|
||||||
|
+#define B4 0xF0 // 11110000
|
||||||
|
+#define B5 0xF8 // 11111000
|
||||||
|
+
|
||||||
|
+// NOTE: these flags are likely to be optimised out as `decode` gets inlined
|
||||||
|
+enum decode_flags {
|
||||||
|
+ DECODE_1 = 1,
|
||||||
|
+ DECODE_2 = 2,
|
||||||
|
+ DECODE_3 = 8,
|
||||||
|
+ DECODE_4 = 16
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong
|
||||||
|
+// codings or out-of-range code points, works by testing all fixed bits in each
|
||||||
|
+// of the 4 coding patterns, then shifting the value bits according to the
|
||||||
|
+// pattern
|
||||||
|
+static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
|
||||||
|
+ if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){
|
||||||
|
+ *codepoint_ret = BYTE(data, 0);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+ if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){
|
||||||
|
+ *codepoint_ret =
|
||||||
|
+ (BYTE(data, 0) & ~B3) << 6 |
|
||||||
|
+ (BYTE(data, 1) & ~B2) << 0;
|
||||||
|
+ return 2;
|
||||||
|
+ }
|
||||||
|
+ if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){
|
||||||
|
+ *codepoint_ret =
|
||||||
|
+ (BYTE(data, 0) & ~B4) << 12 |
|
||||||
|
+ (BYTE(data, 1) & ~B2) << 6 |
|
||||||
|
+ (BYTE(data, 2) & ~B2) << 0;
|
||||||
|
+ return 3;
|
||||||
|
+ }
|
||||||
|
+ if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){
|
||||||
|
+ *codepoint_ret =
|
||||||
|
+ (BYTE(data, 0) & ~B5) << 18 |
|
||||||
|
+ (BYTE(data, 1) & ~B2) << 12 |
|
||||||
|
+ (BYTE(data, 2) & ~B2) << 6 |
|
||||||
|
+ (BYTE(data, 3) & ~B2) << 0;
|
||||||
|
+ return 4;
|
||||||
|
+ }
|
||||||
|
+ *codepoint_ret = -1;
|
||||||
|
+ return 1;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// jvp_utf8_backtrack returns the beginning of the last codepoint in the
|
||||||
|
// string, assuming that start is the last byte in the string.
|
||||||
|
// If the last codepoint is incomplete, returns the number of missing bytes via
|
||||||
|
@@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
|
||||||
|
if (in == end) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
- int codepoint = -1;
|
||||||
|
- unsigned char first = (unsigned char)in[0];
|
||||||
|
- int length = utf8_coding_length[first];
|
||||||
|
- if ((first & 0x80) == 0) {
|
||||||
|
+ uint32_t data = in[0] & 0xFF;
|
||||||
|
+ if ((data & B1) == 0) {
|
||||||
|
/* Fast-path for ASCII */
|
||||||
|
- codepoint = first;
|
||||||
|
- length = 1;
|
||||||
|
- } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
|
||||||
|
- /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
|
||||||
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
|
||||||
|
- length = 1;
|
||||||
|
- } else if (in + length > end) {
|
||||||
|
- /* String ends before UTF8 sequence ends */
|
||||||
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
|
||||||
|
- length = end - in;
|
||||||
|
- } else {
|
||||||
|
- codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
|
||||||
|
- for (int i=1; i<length; i++) {
|
||||||
|
- unsigned ch = (unsigned char)in[i];
|
||||||
|
- if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
|
||||||
|
- /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
|
||||||
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
|
||||||
|
- codepoint = -1;
|
||||||
|
- length = i;
|
||||||
|
- break;
|
||||||
|
- }
|
||||||
|
- codepoint = (codepoint << 6) | (ch & 0x3f);
|
||||||
|
- }
|
||||||
|
- if (codepoint < utf8_first_codepoint[length]) {
|
||||||
|
- /* Overlong UTF8 sequence */
|
||||||
|
- if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
|
||||||
|
- /* UTF-8 error is emitted as a negative codepoint */
|
||||||
|
- codepoint = -(codepoint + 0x80);
|
||||||
|
- } else {
|
||||||
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
|
||||||
|
- codepoint = -1;
|
||||||
|
- }
|
||||||
|
- }
|
||||||
|
- if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
|
||||||
|
- /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
|
||||||
|
- if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
|
||||||
|
- /* Surrogate codepoints can't be encoded in UTF8 */
|
||||||
|
- codepoint = -1;
|
||||||
|
- }
|
||||||
|
+ *codepoint_ret = data;
|
||||||
|
+ return in + 1;
|
||||||
|
+ }
|
||||||
|
+ switch (end - in) {
|
||||||
|
+ default: // fall through
|
||||||
|
+ case 4: data |= (uint32_t)(in[3] & 0xFF) << 24; // fall through
|
||||||
|
+ case 3: data |= (uint32_t)(in[2] & 0xFF) << 16; // fall through
|
||||||
|
+ case 2: data |= (uint32_t)(in[1] & 0xFF) << 8; // fall through
|
||||||
|
+ case 1: break;
|
||||||
|
+ }
|
||||||
|
+ int codepoint;
|
||||||
|
+ int length = decode(DECODE_2 | DECODE_3 | DECODE_4, data, &codepoint);
|
||||||
|
+ if (codepoint == -1) {
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_UTF8) assert(0 && "Invalid WTF-8b sequence: no match");
|
||||||
|
+ } else if (codepoint < utf8_first_codepoint[length]) {
|
||||||
|
+ /* Overlong UTF-8 sequence */
|
||||||
|
+ if ((flags & JVP_UTF8_ERRORS_UTF8) && length == UTF8_ERR_LEN && 0x00 <= codepoint && codepoint <= 0x7F) {
|
||||||
|
+ /* UTF-8 error is emitted as a negative codepoint */
|
||||||
|
+ codepoint = -(codepoint + 0x80);
|
||||||
|
+ } else {
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
|
||||||
|
+ codepoint = -1;
|
||||||
|
}
|
||||||
|
- if (codepoint > 0x10FFFF) {
|
||||||
|
- /* Outside Unicode range */
|
||||||
|
- if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
|
||||||
|
+ } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
|
||||||
|
+ /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
|
||||||
|
+ if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
|
||||||
|
+ /* Surrogate codepoints can't be encoded in UTF8 */
|
||||||
|
codepoint = -1;
|
||||||
|
}
|
||||||
|
+ } else if (codepoint > 0x10FFFF) {
|
||||||
|
+ /* Outside Unicode range */
|
||||||
|
+ if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
|
||||||
|
+ codepoint = -1;
|
||||||
|
}
|
||||||
|
if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
|
||||||
|
codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
@@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
|
||||||
|
return in + length;
|
||||||
|
}
|
||||||
|
|
||||||
|
+// assumes two bytes are readable from `in`
|
||||||
|
+static int decode_utf8_error(const char* in) {
|
||||||
|
+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0);
|
||||||
|
+ int codepoint;
|
||||||
|
+ if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80)
|
||||||
|
+ return codepoint + 0x80;
|
||||||
|
+ return -1;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// decodes the 3-byte encoding of a UTF-16 surrogate (U+D800..U+DFFF inclusive) at `in`; returns the surrogate code point, or -1 if the bytes do not encode one. assumes three bytes are readable from `in`
|
||||||
|
+static int decode_utf16_error(const char* in) {
|
||||||
|
+ uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0);
|
||||||
|
+ int codepoint;
|
||||||
|
+ if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint <= 0xDFFF)
|
||||||
|
+ return codepoint;
|
||||||
|
+ return -1;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// jvp_utf8_extended_join attempts to turn errors at the end of `a` and the
|
||||||
|
+// beginning of `b` into a valid code point. if a correction is possible,
|
||||||
|
+// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing
|
||||||
|
+// errors, and the UTF-8 encoding of the code point to insert is stored in
|
||||||
|
+// `out`. the number of bytes that should be inserted from `out` into the
|
||||||
|
+// middle of the strings is returned (up to 4). this will be 0 if there are no
|
||||||
|
+// bytes to insert.
|
||||||
|
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) {
|
||||||
|
+ const char* aend = astart + *alen_io;
|
||||||
|
+ const char* bstart = *bstart_io;
|
||||||
|
+ const char* bend = bstart + *blen_io;
|
||||||
|
+ int bcp;
|
||||||
|
+ bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp);
|
||||||
|
+ if (!bstart) {
|
||||||
|
+ // end of string
|
||||||
|
+ return 0;
|
||||||
|
+ }
|
||||||
|
+ if (bcp >= 0xDC00 && bcp <= 0xDFFF) {
|
||||||
|
+ // UTF-16 tail surrogate, look for lead surrogate at the end of `a`
|
||||||
|
+ assert(bstart == *bstart_io + UTF16_ERR_LEN);
|
||||||
|
+ if (aend - astart < UTF16_ERR_LEN)
|
||||||
|
+ return 0;
|
||||||
|
+ int acp = decode_utf16_error(aend - UTF16_ERR_LEN);
|
||||||
|
+ if (acp >= 0xD800 && acp <= 0xDBFF) {
|
||||||
|
+ // UTF-16 lead surrogate, decode matching UTF-16 pair
|
||||||
|
+ *alen_io -= UTF16_ERR_LEN;
|
||||||
|
+ *blen_io -= UTF16_ERR_LEN;
|
||||||
|
+ *bstart_io += UTF16_ERR_LEN;
|
||||||
|
+ int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00));
|
||||||
|
+ return jvp_utf8_encode(codepoint, out);
|
||||||
|
+ }
|
||||||
|
+ return 0;
|
||||||
|
+ }
|
||||||
|
+ if (bcp >= -0xFF && bcp <= -0x80) {
|
||||||
|
+ // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte
|
||||||
|
+ bcp = -bcp;
|
||||||
|
+ assert(bstart == *bstart_io + UTF8_ERR_LEN);
|
||||||
|
+ if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
|
||||||
|
+ return 0;
|
||||||
|
+ // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b`
|
||||||
|
+ unsigned char buf[6];
|
||||||
|
+ unsigned char* bufstart = buf + 3;
|
||||||
|
+ unsigned char* bufend = bufstart;
|
||||||
|
+ *bufend++ = bcp;
|
||||||
|
+ int length;
|
||||||
|
+ // search backwards in `a` for a leading byte
|
||||||
|
+ for (;;) {
|
||||||
|
+ if (aend - astart < UTF8_ERR_LEN)
|
||||||
|
+ return 0; // `a` is too short
|
||||||
|
+ int acp = decode_utf8_error(aend - UTF8_ERR_LEN);
|
||||||
|
+ if (acp == -1)
|
||||||
|
+ return 0; // not a UTF-8 error
|
||||||
|
+ aend -= UTF8_ERR_LEN;
|
||||||
|
+ length = utf8_coding_length[acp];
|
||||||
|
+ if (length == 0)
|
||||||
|
+ return 0; // not a possible UTF-8 byte
|
||||||
|
+ *--bufstart = acp;
|
||||||
|
+ if (length != UTF8_CONTINUATION_BYTE)
|
||||||
|
+ break; // found leading byte
|
||||||
|
+ if (bufstart == buf)
|
||||||
|
+ return 0; // too many continuation bytes
|
||||||
|
+ }
|
||||||
|
+ if (bufend - bufstart > length)
|
||||||
|
+ return 0; // too many continuation bytes
|
||||||
|
+ // search forwards in `b` for any more needed continuation bytes
|
||||||
|
+ while (bufend - bufstart < length) {
|
||||||
|
+ if (bend - bstart < UTF8_ERR_LEN)
|
||||||
|
+ return 0; // `b` is too short
|
||||||
|
+ bcp = decode_utf8_error(bstart);
|
||||||
|
+ if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
|
||||||
|
+ return 0; // not a UTF-8 error, didn't find enough continuation bytes
|
||||||
|
+ bstart += UTF8_ERR_LEN;
|
||||||
|
+ *bufend++ = bcp;
|
||||||
|
+ }
|
||||||
|
+ int codepoint;
|
||||||
|
+ // check that the bytes are strict UTF-8
|
||||||
|
+ jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint);
|
||||||
|
+ if (codepoint != -1) {
|
||||||
|
+ memcpy(out, bufstart, 4);
|
||||||
|
+ *alen_io = aend - astart;
|
||||||
|
+ *blen_io = bend - bstart;
|
||||||
|
+ *bstart_io = bstart;
|
||||||
|
+ return bufend - bufstart;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
int jvp_utf8_is_valid(const char* in, const char* end) {
|
||||||
|
int codepoint;
|
||||||
|
while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) {
|
||||||
|
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
|
||||||
|
index 37c7fc0..ff2a437 100644
|
||||||
|
--- a/src/jv_unicode.h
|
||||||
|
+++ b/src/jv_unicode.h
|
||||||
|
@@ -1,6 +1,8 @@
|
||||||
|
#ifndef JV_UNICODE_H
|
||||||
|
#define JV_UNICODE_H
|
||||||
|
|
||||||
|
+#include <stdint.h>
|
||||||
|
+
|
||||||
|
enum jvp_utf8_flags {
|
||||||
|
/* Emit replacement character instead of -1 for errors */
|
||||||
|
JVP_UTF8_REPLACE = 1,
|
||||||
|
@@ -14,6 +16,7 @@ enum jvp_utf8_flags {
|
||||||
|
const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
|
||||||
|
const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
|
||||||
|
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
|
||||||
|
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out);
|
||||||
|
int jvp_utf8_is_valid(const char* in, const char* end);
|
||||||
|
|
||||||
|
int jvp_utf8_decode_length(char startchar);
|
||||||
|
diff --git a/tests/jq.test b/tests/jq.test
|
||||||
|
index c882fd2..9e6c896 100644
|
||||||
|
--- a/tests/jq.test
|
||||||
|
+++ b/tests/jq.test
|
||||||
|
@@ -62,6 +62,11 @@ null
|
||||||
|
null
|
||||||
|
"∀\ud800∃\udc00∅\udfff"
|
||||||
|
|
||||||
|
+# Check that unpaired surrogates are paired when concatenated
|
||||||
|
+add
|
||||||
|
+["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"]
|
||||||
|
+"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼"
|
||||||
|
+
|
||||||
|
"inter\("pol" + "ation")"
|
||||||
|
null
|
||||||
|
"interpolation"
|
||||||
|
@@ -87,6 +92,16 @@ null
|
||||||
|
"Zm/Ds2Jhcgo="
|
||||||
|
"foóbar\n"
|
||||||
|
|
||||||
|
+# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points)
|
||||||
|
+. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
|
||||||
|
+"衍𭌞㾱ꕽ㫑𨫆\u001c㊬𗌽𘀍𗟒𩕃勸騎ᕴ𫸬椀𫎾𰣒ᮍ盕嗪𗬜𨑮𭢊氕㊁"
|
||||||
|
+true
|
||||||
|
+
|
||||||
|
+# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors)
|
||||||
|
+@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
|
||||||
|
+"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK"
|
||||||
|
+true
|
||||||
|
+
|
||||||
|
@uri
|
||||||
|
"\u03bc"
|
||||||
|
"%CE%BC"
|
||||||
@ -0,0 +1,210 @@
|
|||||||
|
From a6ccbaad05bea30c5700b10bd51e46d390496a9b Mon Sep 17 00:00:00 2001
|
||||||
|
From: Max Zerzouri <maxdamantus@gmail.com>
|
||||||
|
Date: Sun, 16 May 2021 09:18:51 +0000
|
||||||
|
Subject: [PATCH] Update `@base64`, `utf8bytelength` and `fromjson` to handle
|
||||||
|
binary strings
|
||||||
|
|
||||||
|
---
|
||||||
|
docs/content/3.manual/manual.yml | 1 -
|
||||||
|
src/builtin.c | 107 ++++++++++++++++++++++++++-----
|
||||||
|
tests/base64.test | 10 +++
|
||||||
|
tests/shtest | 19 ++++--
|
||||||
|
4 files changed, 116 insertions(+), 21 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
|
||||||
|
index bfb17f4..1258dbf 100644
|
||||||
|
--- a/docs/content/3.manual/manual.yml
|
||||||
|
+++ b/docs/content/3.manual/manual.yml
|
||||||
|
@@ -1843,7 +1843,6 @@ sections:
|
||||||
|
* `@base64d`:
|
||||||
|
|
||||||
|
The inverse of `@base64`, input is decoded as specified by RFC 4648.
|
||||||
|
- Note\: If the decoded string is not UTF-8, the results are undefined.
|
||||||
|
|
||||||
|
This syntax can be combined with string interpolation in a
|
||||||
|
useful way. You can follow a `@foo` token with a string
|
||||||
|
diff --git a/src/builtin.c b/src/builtin.c
|
||||||
|
index c6c8c2e..975bf49 100644
|
||||||
|
--- a/src/builtin.c
|
||||||
|
+++ b/src/builtin.c
|
||||||
|
@@ -409,10 +409,55 @@ static jv f_dump(jq_state *jq, jv input) {
|
||||||
|
static jv f_json_parse(jq_state *jq, jv input) {
|
||||||
|
if (jv_get_kind(input) != JV_KIND_STRING)
|
||||||
|
return type_error(input, "only strings can be parsed");
|
||||||
|
- jv res = jv_parse_sized(jv_string_value(input),
|
||||||
|
- jv_string_length_bytes(jv_copy(input)));
|
||||||
|
+
|
||||||
|
+ const char* i = jv_string_value(input);
|
||||||
|
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
|
||||||
|
+
|
||||||
|
+ struct jv_parser* parser = jv_parser_new(0);
|
||||||
|
+ int count = 0;
|
||||||
|
+ jv value = jv_invalid();
|
||||||
|
+ while (i != NULL) {
|
||||||
|
+ const int max_utf8_len = 4;
|
||||||
|
+ unsigned char buf[100 + max_utf8_len];
|
||||||
|
+ int buflen = 0;
|
||||||
|
+ int c;
|
||||||
|
+ while ((buflen + max_utf8_len < sizeof(buf)) && (i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
|
||||||
|
+ if (c >= -0xFF && c <= -0x80) {
|
||||||
|
+ // Invalid UTF-8 byte, pass through
|
||||||
|
+ buf[buflen++] = -c;
|
||||||
|
+ } else
|
||||||
|
+ buflen += jvp_utf8_encode(c, buf + buflen);
|
||||||
|
+ }
|
||||||
|
+ jv_parser_set_buf(parser, buf, buflen, i != NULL);
|
||||||
|
+ for (;;) {
|
||||||
|
+ jv next = jv_parser_next(parser);
|
||||||
|
+ if (!jv_is_valid(next)) {
|
||||||
|
+ if (jv_invalid_has_msg(jv_copy(next))) {
|
||||||
|
+ count++;
|
||||||
|
+ jv_free(value);
|
||||||
|
+ value = next;
|
||||||
|
+ i = NULL;
|
||||||
|
+ }
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ jv_free(value);
|
||||||
|
+ if (count++ == 0)
|
||||||
|
+ value = next;
|
||||||
|
+ else {
|
||||||
|
+ jv_free(next);
|
||||||
|
+ value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
|
||||||
|
+ i = NULL;
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ jv_parser_free(parser);
|
||||||
|
jv_free(input);
|
||||||
|
- return res;
|
||||||
|
+ if (count == 0) {
|
||||||
|
+ jv_free(value);
|
||||||
|
+ value = jv_invalid_with_msg(jv_string("Expected JSON value"));
|
||||||
|
+ }
|
||||||
|
+ return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
static jv f_tonumber(jq_state *jq, jv input) {
|
||||||
|
@@ -457,7 +502,19 @@ static jv f_tostring(jq_state *jq, jv input) {
|
||||||
|
static jv f_utf8bytelength(jq_state *jq, jv input) {
|
||||||
|
if (jv_get_kind(input) != JV_KIND_STRING)
|
||||||
|
return type_error(input, "only strings have UTF-8 byte length");
|
||||||
|
- return jv_number(jv_string_length_bytes(input));
|
||||||
|
+ const char* i = jv_string_value(input);
|
||||||
|
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
|
||||||
|
+ int len = 0;
|
||||||
|
+ int c;
|
||||||
|
+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
|
||||||
|
+ if (c >= -0xFF && c <= -0x80) {
|
||||||
|
+ // Invalid UTF-8 byte, will be passed through
|
||||||
|
+ len++;
|
||||||
|
+ } else
|
||||||
|
+ len += jvp_utf8_encode_length(c);
|
||||||
|
+ }
|
||||||
|
+ jv_free(input);
|
||||||
|
+ return jv_number(len);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
||||||
|
@@ -632,21 +689,41 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
|
||||||
|
jv_free(fmt);
|
||||||
|
input = f_tostring(jq, input);
|
||||||
|
jv line = jv_string("");
|
||||||
|
- const unsigned char* data = (const unsigned char*)jv_string_value(input);
|
||||||
|
- int len = jv_string_length_bytes(jv_copy(input));
|
||||||
|
- for (int i=0; i<len; i+=3) {
|
||||||
|
- uint32_t code = 0;
|
||||||
|
- int n = len - i >= 3 ? 3 : len-i;
|
||||||
|
- for (int j=0; j<3; j++) {
|
||||||
|
+ const char* i = jv_string_value(input);
|
||||||
|
+ const char* end = i + jv_string_length_bytes(jv_copy(input));
|
||||||
|
+ uint32_t code = 0;
|
||||||
|
+ int n = 0;
|
||||||
|
+ int c;
|
||||||
|
+ while ((i = jvp_utf8_extended_next(i, end, JVP_UTF8_REPLACE | JVP_UTF8_ERRORS_UTF8, &c))) {
|
||||||
|
+ unsigned char ubuf[4];
|
||||||
|
+ int len = 0;
|
||||||
|
+ if (c >= -0xFF && c <= -0x80) {
|
||||||
|
+ // Invalid UTF-8 byte, pass through
|
||||||
|
+ ubuf[len++] = -c;
|
||||||
|
+ } else
|
||||||
|
+ len += jvp_utf8_encode(c, ubuf);
|
||||||
|
+ for (int x = 0; x < len; x++) {
|
||||||
|
code <<= 8;
|
||||||
|
- code |= j < n ? (unsigned)data[i+j] : 0;
|
||||||
|
+ code |= ubuf[x];
|
||||||
|
+ if (++n == 3) {
|
||||||
|
+ char buf[4];
|
||||||
|
+ for (int j = 0; j < 4; j++)
|
||||||
|
+ buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
|
||||||
|
+ line = jv_string_append_buf(line, buf, sizeof(buf));
|
||||||
|
+ n = 0;
|
||||||
|
+ code = 0;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
+ }
|
||||||
|
+ if (n > 0) {
|
||||||
|
+ assert(n < 3);
|
||||||
|
+ code <<= 8*(3 - n);
|
||||||
|
char buf[4];
|
||||||
|
- for (int j=0; j<4; j++) {
|
||||||
|
+ for (int j = 0; j < 4; j++)
|
||||||
|
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
|
||||||
|
- }
|
||||||
|
- if (n < 3) buf[3] = '=';
|
||||||
|
- if (n < 2) buf[2] = '=';
|
||||||
|
+ buf[3] = '=';
|
||||||
|
+ if (n < 2)
|
||||||
|
+ buf[2] = '=';
|
||||||
|
line = jv_string_append_buf(line, buf, sizeof(buf));
|
||||||
|
}
|
||||||
|
jv_free(input);
|
||||||
|
diff --git a/tests/base64.test b/tests/base64.test
|
||||||
|
index 0f82b0b..6507bb8 100644
|
||||||
|
--- a/tests/base64.test
|
||||||
|
+++ b/tests/base64.test
|
||||||
|
@@ -33,3 +33,13 @@
|
||||||
|
. | try @base64d catch .
|
||||||
|
"QUJDa"
|
||||||
|
"string (\"QUJDa\") trailing base64 byte found"
|
||||||
|
+
|
||||||
|
+# random binary data
|
||||||
|
+(. | @base64d | @base64) == .
|
||||||
|
+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="
|
||||||
|
+true
|
||||||
|
+
|
||||||
|
+# replace lone surrogates
|
||||||
|
+@base64
|
||||||
|
+"foo\udca9\ud83dbar"
|
||||||
|
+"Zm9v77+977+9YmFy"
|
||||||
|
diff --git a/tests/shtest b/tests/shtest
|
||||||
|
index 4c8b57e..7de61e4 100755
|
||||||
|
--- a/tests/shtest
|
||||||
|
+++ b/tests/shtest
|
||||||
|
@@ -131,11 +131,20 @@ cmp $d/out $d/expected
|
||||||
|
|
||||||
|
|
||||||
|
clean=false
|
||||||
|
-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
|
||||||
|
-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
|
||||||
|
-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
|
||||||
|
-$VALGRIND $Q $JQ -j . $d/out.json >$d/out
|
||||||
|
-cmp $d/out $d/rand
|
||||||
|
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings
|
||||||
|
+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
|
||||||
|
+ $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
|
||||||
|
+ $VALGRIND $Q $JQ -j . $d/out.json >$d/out
|
||||||
|
+ cmp $d/out $d/rand
|
||||||
|
+ $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out
|
||||||
|
+ cmp $d/out $d/rand
|
||||||
|
+ $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
|
||||||
|
+ cmp $d/out $d/rand
|
||||||
|
+ base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
|
||||||
|
+ cmp $d/out $d/rand
|
||||||
|
+ $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
|
||||||
|
+ cmp $d/out $d/rand
|
||||||
|
+fi
|
||||||
|
clean=true
|
||||||
|
|
||||||
|
|
||||||
16
jq.spec
16
jq.spec
@ -1,12 +1,17 @@
|
|||||||
Name: jq
|
Name: jq
|
||||||
Version: 1.6
|
Version: 1.6
|
||||||
Release: 1
|
Release: 2
|
||||||
Summary: A lightweight and flexible command-line JSON processor
|
Summary: A lightweight and flexible command-line JSON processor
|
||||||
License: MIT and ASL 2.0 and CC-BY and GPLv3
|
License: MIT and ASL 2.0 and CC-BY and GPLv3
|
||||||
URL: http://stedolan.github.io/jq/
|
URL: http://stedolan.github.io/jq/
|
||||||
Source0: https://github.com/stedolan/jq/releases/download/jq-%{version}/jq-%{version}.tar.gz
|
Source0: https://github.com/stedolan/jq/releases/download/jq-%{version}/jq-%{version}.tar.gz
|
||||||
BuildRequires: make flex bison valgrind gcc chrpath oniguruma-devel
|
BuildRequires: make flex bison valgrind gcc chrpath oniguruma-devel
|
||||||
|
|
||||||
|
Patch0001: jv_string_implode-avoid-producing-unprintable-string-fromreserved-code-points.patch
|
||||||
|
Patch0002: Binary-strings-preserve-UTF-8-and-UTF-16-errors.patch
|
||||||
|
Patch0003: Update-base64-utf8bytelength-and-fromjson-to-handlebinary-strings.patch
|
||||||
|
Patch0004: Correct-UTF-8-and-UTF-16-errors-during-concatenation.patch
|
||||||
|
|
||||||
%description
|
%description
|
||||||
jq is a lightweight and flexible command-line JSON processor.
|
jq is a lightweight and flexible command-line JSON processor.
|
||||||
you can use it to slice and filter and map and transform structured data.
|
you can use it to slice and filter and map and transform structured data.
|
||||||
@ -28,15 +33,15 @@ BuildArch: noarch
|
|||||||
Documentation for jq package.
|
Documentation for jq package.
|
||||||
|
|
||||||
%prep
|
%prep
|
||||||
%autosetup -n jq-%{version}
|
%autosetup -n jq-%{version} -p1
|
||||||
|
|
||||||
%build
|
%build
|
||||||
%configure --disable-static
|
%configure
|
||||||
%make_build
|
%make_build
|
||||||
|
|
||||||
%install
|
%install
|
||||||
%make_install
|
%make_install
|
||||||
%delete_la
|
%delete_la_and_a
|
||||||
chrpath -d %{buildroot}%{_bindir}/%{name}
|
chrpath -d %{buildroot}%{_bindir}/%{name}
|
||||||
|
|
||||||
%check
|
%check
|
||||||
@ -70,6 +75,9 @@ make check
|
|||||||
|
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Mon Aug 30 2021 lingsheng <lingsheng@huawei.com> - 1.6-2
|
||||||
|
- Support binary strings preserve UTF-8 and UTF-16 errors
|
||||||
|
|
||||||
* Wed Aug 25 2021 wangyue <wangyue92@huawei.com> - 1.6-1
|
* Wed Aug 25 2021 wangyue <wangyue92@huawei.com> - 1.6-1
|
||||||
- Upgrade to 1.6
|
- Upgrade to 1.6
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,23 @@
|
|||||||
|
From e165542664e9fe3c155eeb13e16320a07dfbd5fd Mon Sep 17 00:00:00 2001
|
||||||
|
From: Max Zerzouri <maxdamantus@gmail.com>
|
||||||
|
Date: Sat, 15 May 2021 10:50:15 +0000
|
||||||
|
Subject: [PATCH] jv_string_implode: avoid producing unprintable string from
|
||||||
|
reserved code points
|
||||||
|
|
||||||
|
---
|
||||||
|
src/jv.c | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/src/jv.c b/src/jv.c
|
||||||
|
index 979d188..1f1029e 100644
|
||||||
|
--- a/src/jv.c
|
||||||
|
+++ b/src/jv.c
|
||||||
|
@@ -725,7 +725,7 @@ jv jv_string_implode(jv j) {
|
||||||
|
jv n = jv_array_get(jv_copy(j), i);
|
||||||
|
assert(jv_get_kind(n) == JV_KIND_NUMBER);
|
||||||
|
int nv = jv_number_value(n);
|
||||||
|
- if (nv > 0x10FFFF)
|
||||||
|
+ if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF)
|
||||||
|
nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
|
||||||
|
s = jv_string_append_codepoint(s, nv);
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user