--- a/ucx/json.c Mon Jan 06 21:18:56 2025 +0100 +++ b/ucx/json.c Sun Feb 23 13:11:32 2025 +0100 @@ -27,13 +27,10 @@ */ #include "cx/json.h" -#include "cx/compare.h" #include <string.h> -#include <ctype.h> #include <assert.h> #include <stdio.h> -#include <errno.h> #include <inttypes.h> /* @@ -135,6 +132,16 @@ } } +static bool json_isdigit(char c) { + // TODO: remove once UCX has public API for this + return c >= '0' && c <= '9'; +} + +static bool json_isspace(char c) { + // TODO: remove once UCX has public API for this + return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\v' || c == '\f'; +} + static int num_isexp(const char *content, size_t length, size_t pos) { if (pos >= length) { return 0; @@ -143,7 +150,7 @@ int ok = 0; for (size_t i = pos; i < length; i++) { char c = content[i]; - if (isdigit(c)) { + if (json_isdigit(c)) { ok = 1; } else if (i == pos) { if (!(c == '+' || c == '-')) { @@ -160,7 +167,7 @@ static CxJsonTokenType token_numbertype(const char *content, size_t length) { if (length == 0) return CX_JSON_TOKEN_ERROR; - if (content[0] != '-' && !isdigit(content[0])) { + if (content[0] != '-' && !json_isdigit(content[0])) { return CX_JSON_TOKEN_ERROR; } @@ -173,7 +180,7 @@ type = CX_JSON_TOKEN_NUMBER; } else if (content[i] == 'e' || content[i] == 'E') { return num_isexp(content, length, i + 1) ? CX_JSON_TOKEN_NUMBER : CX_JSON_TOKEN_ERROR; - } else if (!isdigit(content[i])) { + } else if (!json_isdigit(content[i])) { return CX_JSON_TOKEN_ERROR; // char is not a digit, decimal separator or exponent sep } } @@ -237,7 +244,7 @@ return CX_JSON_TOKEN_STRING; } default: { - if (isspace(c)) { + if (json_isspace(c)) { return CX_JSON_TOKEN_SPACE; } } @@ -254,7 +261,10 @@ // current token type and start index CxJsonTokenType ttype = json->uncompleted.tokentype; - size_t token_start = json->buffer.pos; + size_t token_part_start = json->buffer.pos; + + bool escape_end_of_string = ttype == CX_JSON_TOKEN_STRING + && json->uncompleted.content.ptr[json->uncompleted.content.length-1] == '\\'; for (size_t i = json->buffer.pos; i < json->buffer.size; i++) { char c = json->buffer.space[i]; @@ -268,7 +278,7 @@ } else if (ctype == CX_JSON_TOKEN_STRING) { // begin string ttype = CX_JSON_TOKEN_STRING; - token_start = i; + token_part_start = i; } else if (ctype != CX_JSON_NO_TOKEN) { // single-char token json->buffer.pos = i + 1; @@ -276,12 +286,12 @@ return CX_JSON_NO_ERROR; } else { ttype = CX_JSON_TOKEN_LITERAL; // number or literal - token_start = i; + token_part_start = i; } } else { // finish token if (ctype != CX_JSON_NO_TOKEN) { - *result = token_create(json, false, token_start, i); + *result = token_create(json, false, token_part_start, i); if (result->tokentype == CX_JSON_NO_TOKEN) { return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE } @@ -294,18 +304,18 @@ } } else { // currently inside a string - if (json->tokenizer_escape) { - json->tokenizer_escape = false; + if (escape_end_of_string) { + escape_end_of_string = false; } else { if (c == '"') { - *result = token_create(json, true, token_start, i + 1); + *result = token_create(json, true, token_part_start, i + 1); if (result->tokentype == CX_JSON_NO_TOKEN) { return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE } json->buffer.pos = i + 1; return CX_JSON_NO_ERROR; } else if (c == '\\') { - json->tokenizer_escape = true; + escape_end_of_string = true; } } } @@ -313,13 +323,13 @@ if (ttype != CX_JSON_NO_TOKEN) { // uncompleted token - size_t uncompleted_len = json->buffer.size - token_start; + size_t uncompleted_len = json->buffer.size - token_part_start; if (json->uncompleted.tokentype == CX_JSON_NO_TOKEN) { // current token is uncompleted // save current token content CxJsonToken uncompleted = { ttype, true, - cx_strdup(cx_strn(json->buffer.space + token_start, uncompleted_len)) + cx_strdup(cx_strn(json->buffer.space + token_part_start, uncompleted_len)) }; if (uncompleted.content.ptr == NULL) { return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE @@ -330,7 +340,7 @@ // combine the uncompleted token with the current token assert(json->uncompleted.allocated); cxmutstr str = cx_strcat_m(json->uncompleted.content, 1, - cx_strn(json->buffer.space + token_start, uncompleted_len)); + cx_strn(json->buffer.space + token_part_start, uncompleted_len)); if (str.ptr == NULL) { return CX_JSON_BUFFER_ALLOC_FAILED; // LCOV_EXCL_LINE } @@ -343,9 +353,75 @@ return CX_JSON_INCOMPLETE_DATA; } +// converts a Unicode codepoint to utf8 +static unsigned codepoint_to_utf8(uint32_t codepoint, char *output_buf) { + if (codepoint <= 0x7F) { + *output_buf = (char)codepoint; + return 1; + } else if (codepoint <= 0x7FF) { + output_buf[0] = (char)(0xC0 | ((codepoint >> 6) & 0x1F)); + output_buf[1] = (char)(0x80 | (codepoint & 0x3F)); + return 2; + } else if (codepoint <= 0xFFFF) { + output_buf[0] = (char)(0xE0 | ((codepoint >> 12) & 0x0F)); + output_buf[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + output_buf[2] = (char)(0x80 | (codepoint & 0x3F)); + return 3; + } else if (codepoint <= 0x10FFFF) { + output_buf[0] = (char)(0xF0 | ((codepoint >> 18) & 0x07)); + output_buf[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F)); + output_buf[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F)); + output_buf[3] = (char)(0x80 | (codepoint & 0x3F)); + return 4; + } + + return 0; // LCOV_EXCL_LINE +} + +// converts a utf16 surrogate pair to utf8 +static inline uint32_t utf16pair_to_codepoint(uint16_t c0, uint16_t c1) { + return ((c0 - 0xD800) << 10) + (c1 - 0xDC00) + 0x10000; +} + +static unsigned unescape_unicode_string(cxstring str, char *utf8buf) { + // str is supposed to start with "\uXXXX" or "\uXXXX\uXXXX" + // remaining bytes in the string are ignored (str may be larger!) + + if (str.length < 6 || str.ptr[0] != '\\' || str.ptr[1] != 'u') { + return 0; + } + + unsigned utf8len = 0; + cxstring ustr1 = { str.ptr + 2, 4}; + uint16_t utf16a, utf16b; + if (!cx_strtou16_lc(ustr1, &utf16a, 16, "")) { + uint32_t codepoint; + if (utf16a < 0xD800 || utf16a > 0xE000) { + // character is in the Basic Multilingual Plane + // and encoded as a single utf16 char + codepoint = utf16a; + utf8len = codepoint_to_utf8(codepoint, utf8buf); + } else if (utf16a >= 0xD800 && utf16a <= 0xDBFF) { + // character is encoded as a surrogate pair + // get next 6 bytes + if (str.length >= 12) { + if (str.ptr[6] == '\\' && str.ptr[7] == 'u') { + cxstring ustr2 = { str.ptr+8, 4 }; + if (!cx_strtou16_lc(ustr2, &utf16b, 16, "") + && utf16b >= 0xDC00 && utf16b <= 0xDFFF) { + codepoint = utf16pair_to_codepoint(utf16a, utf16b); + utf8len = codepoint_to_utf8(codepoint, utf8buf); + } + } + } + } + } + return utf8len; +} + static cxmutstr unescape_string(const CxAllocator *a, cxmutstr str) { - // TODO: support more escape sequences - // we know that the unescaped string will be shorter by at least 2 chars + // note: this function expects that str contains the enclosing quotes! + cxmutstr result; result.length = 0; result.ptr = cxMalloc(a, str.length - 1); @@ -358,9 +434,45 @@ u = false; if (c == 'n') { c = '\n'; + } else if (c == '"') { + c = '"'; } else if (c == 't') { c = '\t'; + } else if (c == 'r') { + c = '\r'; + } else if (c == '\\') { + c = '\\'; + } else if (c == '/') { + c = '/'; // always unescape, we don't need settings here + } else if (c == 'f') { + c = '\f'; + } else if (c == 'b') { + c = '\b'; + } else if (c == 'u') { + char utf8buf[4]; + unsigned utf8len = unescape_unicode_string( + cx_strn(str.ptr + i - 1, str.length + 1 - i), + utf8buf + ); + if(utf8len > 0) { + i += utf8len < 4 ? 4 : 10; + // add all bytes from utf8buf except the last char + // to the result (last char will be added below) + utf8len--; + c = utf8buf[utf8len]; + for (unsigned x = 0; x < utf8len; x++) { + result.ptr[result.length++] = utf8buf[x]; + } + } else { + // decoding failed, ignore the entire sequence + result.ptr[result.length++] = '\\'; + } + } else { + // TODO: discuss the behavior for unrecognized escape sequences + // most parsers throw an error here - we just ignore it + result.ptr[result.length++] = '\\'; } + result.ptr[result.length++] = c; } else { if (c == '\\') { @@ -375,7 +487,60 @@ return result; } -static CxJsonValue* create_json_value(CxJson *json, CxJsonValueType type) { +static cxmutstr escape_string(cxmutstr str, bool escape_slash) { + // note: this function produces the string without enclosing quotes + // the reason is that we don't want to allocate memory just for that + CxBuffer buf = {0}; + + bool all_printable = true; + for (size_t i = 0; i < str.length; i++) { + unsigned char c = str.ptr[i]; + bool escape = c < 0x20 || c == '\\' || c == '"' + || (escape_slash && c == '/'); + + if (all_printable && escape) { + size_t capa = str.length + 32; + char *space = malloc(capa); + if (space == NULL) return cx_mutstrn(NULL, 0); + cxBufferInit(&buf, space, capa, NULL, CX_BUFFER_AUTO_EXTEND); + cxBufferWrite(str.ptr, 1, i, &buf); + all_printable = false; + } + if (escape) { + cxBufferPut(&buf, '\\'); + if (c == '\"') { + cxBufferPut(&buf, '\"'); + } else if (c == '\n') { + cxBufferPut(&buf, 'n'); + } else if (c == '\t') { + cxBufferPut(&buf, 't'); + } else if (c == '\r') { + cxBufferPut(&buf, 'r'); + } else if (c == '\\') { + cxBufferPut(&buf, '\\'); + } else if (c == '/') { + cxBufferPut(&buf, '/'); + } else if (c == '\f') { + cxBufferPut(&buf, 'f'); + } else if (c == '\b') { + cxBufferPut(&buf, 'b'); + } else { + char code[6]; + snprintf(code, sizeof(code), "u%04x", (unsigned int) c); + cxBufferPutString(&buf, code); + } + } else if (!all_printable) { + cxBufferPut(&buf, c); + } + } + if (!all_printable) { + str = cx_mutstrn(buf.space, buf.size); + } + cxBufferDestroy(&buf); + return str; +} + +static CxJsonValue* json_create_value(CxJson *json, CxJsonValueType type) { CxJsonValue *v = cxCalloc(json->allocator, 1, sizeof(CxJsonValue)); if (v == NULL) return NULL; // LCOV_EXCL_LINE @@ -541,21 +706,21 @@ json_add_state(json, 10 + state); switch (token.tokentype) { case CX_JSON_TOKEN_BEGIN_ARRAY: { - if (create_json_value(json, CX_JSON_ARRAY) == NULL) { + if (json_create_value(json, CX_JSON_ARRAY) == NULL) { return_rec(CX_JSON_VALUE_ALLOC_FAILED); // LCOV_EXCL_LINE } json_add_state(json, JP_STATE_VALUE_BEGIN_AR); return_rec(CX_JSON_NO_ERROR); } case CX_JSON_TOKEN_BEGIN_OBJECT: { - if (create_json_value(json, CX_JSON_OBJECT) == NULL) { + if (json_create_value(json, CX_JSON_OBJECT) == NULL) { return_rec(CX_JSON_VALUE_ALLOC_FAILED); // LCOV_EXCL_LINE } json_add_state(json, JP_STATE_OBJ_NAME_OR_CLOSE); return_rec(CX_JSON_NO_ERROR); } case CX_JSON_TOKEN_STRING: { - if ((vbuf = create_json_value(json, CX_JSON_STRING)) == NULL) { + if ((vbuf = json_create_value(json, CX_JSON_STRING)) == NULL) { return_rec(CX_JSON_VALUE_ALLOC_FAILED); // LCOV_EXCL_LINE } cxmutstr str = unescape_string(json->allocator, token.content); @@ -568,7 +733,7 @@ case CX_JSON_TOKEN_INTEGER: case CX_JSON_TOKEN_NUMBER: { int type = token.tokentype == CX_JSON_TOKEN_INTEGER ? CX_JSON_INTEGER : CX_JSON_NUMBER; - if (NULL == (vbuf = create_json_value(json, type))) { + if (NULL == (vbuf = json_create_value(json, type))) { return_rec(CX_JSON_VALUE_ALLOC_FAILED); // LCOV_EXCL_LINE } if (type == CX_JSON_INTEGER) { @@ -583,7 +748,7 @@ return_rec(CX_JSON_NO_ERROR); } case CX_JSON_TOKEN_LITERAL: { - if ((vbuf = create_json_value(json, CX_JSON_LITERAL)) == NULL) { + if ((vbuf = json_create_value(json, CX_JSON_LITERAL)) == NULL) { return_rec(CX_JSON_VALUE_ALLOC_FAILED); // LCOV_EXCL_LINE } if (0 == cx_strcmp(cx_strcast(token.content), cx_str("true"))) { @@ -734,6 +899,7 @@ } CxJsonValue* cxJsonCreateObj(const CxAllocator* allocator) { + if (allocator == NULL) allocator = cxDefaultAllocator; CxJsonValue* v = cxMalloc(allocator, sizeof(CxJsonValue)); if (v == NULL) return NULL; v->allocator = allocator; @@ -755,6 +921,7 @@ } CxJsonValue* cxJsonCreateArr(const CxAllocator* allocator) { + if (allocator == NULL) allocator = cxDefaultAllocator; CxJsonValue* v = cxMalloc(allocator, sizeof(CxJsonValue)); if (v == NULL) return NULL; v->allocator = allocator; @@ -765,6 +932,7 @@ } CxJsonValue* cxJsonCreateNumber(const CxAllocator* allocator, double num) { + if (allocator == NULL) allocator = cxDefaultAllocator; CxJsonValue* v = cxMalloc(allocator, sizeof(CxJsonValue)); if (v == NULL) return NULL; v->allocator = allocator; @@ -774,6 +942,7 @@ } CxJsonValue* cxJsonCreateInteger(const CxAllocator* allocator, int64_t num) { + if (allocator == NULL) allocator = cxDefaultAllocator; CxJsonValue* v = cxMalloc(allocator, sizeof(CxJsonValue)); if (v == NULL) return NULL; v->allocator = allocator; @@ -787,6 +956,7 @@ } CxJsonValue* cxJsonCreateCxString(const CxAllocator* allocator, cxstring str) { + if (allocator == NULL) allocator = cxDefaultAllocator; CxJsonValue* v = cxMalloc(allocator, sizeof(CxJsonValue)); if (v == NULL) return NULL; v->allocator = allocator; @@ -798,6 +968,7 @@ } CxJsonValue* cxJsonCreateLiteral(const CxAllocator* allocator, CxJsonLiteral lit) { + if (allocator == NULL) allocator = cxDefaultAllocator; CxJsonValue* v = cxMalloc(allocator, sizeof(CxJsonValue)); if (v == NULL) return NULL; v->allocator = allocator; @@ -808,7 +979,7 @@ // LCOV_EXCL_START // never called as long as malloc() does not return NULL -static void cx_json_arr_free_temp(CxJsonValue** values, size_t count) { +static void json_arr_free_temp(CxJsonValue** values, size_t count) { for (size_t i = 0; i < count; i++) { if (values[i] == NULL) break; cxJsonValueFree(values[i]); @@ -822,7 +993,7 @@ if (values == NULL) return -1; for (size_t i = 0; i < count; i++) { values[i] = cxJsonCreateNumber(arr->allocator, num[i]); - if (values[i] == NULL) { cx_json_arr_free_temp(values, count); return -1; } + if (values[i] == NULL) { json_arr_free_temp(values, count); return -1; } } int ret = cxJsonArrAddValues(arr, values, count); free(values); @@ -834,7 +1005,7 @@ if (values == NULL) return -1; for (size_t i = 0; i < count; i++) { values[i] = cxJsonCreateInteger(arr->allocator, num[i]); - if (values[i] == NULL) { cx_json_arr_free_temp(values, count); return -1; } + if (values[i] == NULL) { json_arr_free_temp(values, count); return -1; } } int ret = cxJsonArrAddValues(arr, values, count); free(values); @@ -846,7 +1017,7 @@ if (values == NULL) return -1; for (size_t i = 0; i < count; i++) { values[i] = cxJsonCreateString(arr->allocator, str[i]); - if (values[i] == NULL) { cx_json_arr_free_temp(values, count); return -1; } + if (values[i] == NULL) { json_arr_free_temp(values, count); return -1; } } int ret = cxJsonArrAddValues(arr, values, count); free(values); @@ -858,7 +1029,7 @@ if (values == NULL) return -1; for (size_t i = 0; i < count; i++) { values[i] = cxJsonCreateCxString(arr->allocator, str[i]); - if (values[i] == NULL) { cx_json_arr_free_temp(values, count); return -1; } + if (values[i] == NULL) { json_arr_free_temp(values, count); return -1; } } int ret = cxJsonArrAddValues(arr, values, count); free(values); @@ -870,7 +1041,7 @@ if (values == NULL) return -1; for (size_t i = 0; i < count; i++) { values[i] = cxJsonCreateLiteral(arr->allocator, lit[i]); - if (values[i] == NULL) { cx_json_arr_free_temp(values, count); return -1; } + if (values[i] == NULL) { json_arr_free_temp(values, count); return -1; } } int ret = cxJsonArrAddValues(arr, values, count); free(values); @@ -979,25 +1150,25 @@ } } -static const CxJsonWriter cx_json_writer_default = { - false, - true, - 255, - false, - 4 -}; - CxJsonWriter cxJsonWriterCompact(void) { - return cx_json_writer_default; + return (CxJsonWriter) { + false, + true, + 6, + false, + 4, + false + }; } CxJsonWriter cxJsonWriterPretty(bool use_spaces) { return (CxJsonWriter) { true, true, - 255, + 6, use_spaces, - 4 + 4, + false }; } @@ -1044,7 +1215,7 @@ size_t actual = 0, expected = 0; // small buffer for number to string conversions - char numbuf[32]; + char numbuf[40]; // recursively write the values switch (value->type) { @@ -1078,9 +1249,11 @@ // the name actual += wfunc("\"", 1, 1, target); - // TODO: escape the string - actual += wfunc(member->name.ptr, 1, - member->name.length, target); + cxmutstr name = escape_string(member->name, settings->escape_slash); + actual += wfunc(name.ptr, 1, name.length, target); + if (name.ptr != member->name.ptr) { + cx_strfree(&name); + } actual += wfunc("\"", 1, 1, target); const char *obj_name_sep = ": "; if (settings->pretty) { @@ -1146,20 +1319,81 @@ } case CX_JSON_STRING: { actual += wfunc("\"", 1, 1, target); - // TODO: escape the string - actual += wfunc(value->value.string.ptr, 1, - value->value.string.length, target); + cxmutstr str = escape_string(value->value.string, settings->escape_slash); + actual += wfunc(str.ptr, 1, str.length, target); + if (str.ptr != value->value.string.ptr) { + cx_strfree(&str); + } actual += wfunc("\"", 1, 1, target); expected += 2 + value->value.string.length; break; } case CX_JSON_NUMBER: { - // TODO: locale bullshit - // TODO: formatting settings - snprintf(numbuf, 32, "%g", value->value.number); - size_t len = strlen(numbuf); - actual += wfunc(numbuf, 1, len, target); - expected += len; + int precision = settings->frac_max_digits; + // because of the way how %g is defined, we need to + // double the precision and truncate ourselves + precision = 1 + (precision > 15 ? 30 : 2 * precision); + snprintf(numbuf, 40, "%.*g", precision, value->value.number); + char *dot, *exp; + unsigned char max_digits; + // find the decimal separator and hope that it's one of . or , + dot = strchr(numbuf, '.'); + if (dot == NULL) { + dot = strchr(numbuf, ','); + } + if (dot == NULL) { + // no decimal separator found + // output everything until a possible exponent + max_digits = 30; + dot = numbuf; + } else { + // found a decimal separator + // output everything until the separator + // and set max digits to what the settings say + size_t len = dot - numbuf; + actual += wfunc(numbuf, 1, len, target); + expected += len; + max_digits = settings->frac_max_digits; + if (max_digits > 15) { + max_digits = 15; + } + // locale independent separator + if (max_digits > 0) { + actual += wfunc(".", 1, 1, target); + expected++; + } + dot++; + } + // find the exponent + exp = strchr(dot, 'e'); + if (exp == NULL) { + // no exponent - output the rest + if (max_digits > 0) { + size_t len = strlen(dot); + if (len > max_digits) { + len = max_digits; + } + actual += wfunc(dot, 1, len, target); + expected += len; + } + } else { + // exponent found - truncate the frac digits + // and then output the rest + if (max_digits > 0) { + size_t len = exp - dot - 1; + if (len > max_digits) { + len = max_digits; + } + actual += wfunc(dot, 1, len, target); + expected += len; + } + actual += wfunc("e", 1, 1, target); + expected++; + exp++; + size_t len = strlen(exp); + actual += wfunc(exp, 1, len, target); + expected += len; + } break; } case CX_JSON_INTEGER: { @@ -1201,12 +1435,13 @@ cx_write_func wfunc, const CxJsonWriter *settings ) { - if (settings == NULL) { - settings = &cx_json_writer_default; - } assert(target != NULL); assert(value != NULL); assert(wfunc != NULL); + CxJsonWriter writer_default = cxJsonWriterCompact(); + if (settings == NULL) { + settings = &writer_default; + } return cx_json_write_rec(target, value, wfunc, settings, 0); }