From 2d962c63218ebf8aed46ac918d9a783b8106f0ce Mon Sep 17 00:00:00 2001 From: gitlost Date: Sun, 21 Mar 2021 17:35:52 +0000 Subject: [PATCH] get_best_eci: check UTF-8 before returning 26; move is_valid_utf8 to common --- backend/common.c | 14 ++++++++++++ backend/common.h | 1 + backend/eci.c | 4 ++++ backend/library.c | 30 +++++++------------------- backend/tests/test_common.c | 40 ++++++++++++++++++++++++++++++++++ backend/tests/test_eci.c | 37 +++++++++++++++++++++++++++++++ backend/tests/test_library.c | 42 ------------------------------------ 7 files changed, 104 insertions(+), 64 deletions(-) diff --git a/backend/common.c b/backend/common.c index 47d48fe6..e0803014 100644 --- a/backend/common.c +++ b/backend/common.c @@ -335,6 +335,20 @@ INTERNAL unsigned int decode_utf8(unsigned int *state, unsigned int *codep, cons return *state; } +/* Is string valid UTF-8? */ +INTERNAL int is_valid_utf8(const unsigned char source[], const int length) { + int i; + unsigned int codepoint, state = 0; + + for (i = 0; i < length; i++) { + if (decode_utf8(&state, &codepoint, source[i]) == 12) { + return 0; + } + } + + return state == 0; +} + /* Convert UTF-8 to Unicode. If `disallow_4byte` unset, allow all values (UTF-32). If `disallow_4byte` set, * only allow codepoints <= U+FFFF (ie four-byte sequences not allowed) (UTF-16, no surrogates) */ INTERNAL int utf8_to_unicode(struct zint_symbol *symbol, const unsigned char source[], unsigned int vals[], diff --git a/backend/common.h b/backend/common.h index b7855312..c6c4b071 100644 --- a/backend/common.h +++ b/backend/common.h @@ -112,6 +112,7 @@ extern "C" { INTERNAL int is_composite(const int symbology); INTERNAL int istwodigits(const unsigned char source[], const int length, const int position); INTERNAL unsigned int decode_utf8(unsigned int *state, unsigned int *codep, const unsigned char byte); + INTERNAL int is_valid_utf8(const unsigned char source[], const int length); INTERNAL int utf8_to_unicode(struct zint_symbol *symbol, const unsigned char source[], unsigned int vals[], int *length, const int disallow_4byte); INTERNAL void set_minimum_height(struct zint_symbol *symbol, const int min_height); diff --git a/backend/eci.c b/backend/eci.c index 7b084c41..5fd89f63 100644 --- a/backend/eci.c +++ b/backend/eci.c @@ -271,5 +271,9 @@ INTERNAL int get_best_eci(const unsigned char source[], int length) { eci++; } while (eci < 25); + if (!is_valid_utf8(source, length)) { + return 0; + } + return 26; // If all of these fail, use Unicode! } diff --git a/backend/library.c b/backend/library.c index 3fa691f8..34397181 100644 --- a/backend/library.c +++ b/backend/library.c @@ -1072,20 +1072,6 @@ static int escape_char_process(struct zint_symbol *symbol, unsigned char *input_ return error_number; } -/* Is string valid UTF-8? */ -STATIC_UNLESS_ZINT_TEST int is_valid_utf8(const unsigned char source[], const int length) { - int i; - unsigned int codepoint, state = 0; - - for (i = 0; i < length; i++) { - if (decode_utf8(&state, &codepoint, source[i]) == 12) { - return 0; - } - } - - return state == 0; -} - int ZBarcode_Encode(struct zint_symbol *symbol, const unsigned char *source, int in_length) { int error_number, warn_number; #ifdef _MSC_VER @@ -1327,15 +1313,15 @@ int ZBarcode_Encode(struct zint_symbol *symbol, const unsigned char *source, int && (symbol->input_mode & 0x07) == UNICODE_MODE) { /* Try another ECI mode */ symbol->eci = get_best_eci(local_source, in_length); - - error_number = extended_or_reduced_charset(symbol, local_source, in_length); - - if (error_number == 0) { - error_number = ZINT_WARN_USES_ECI; - if (!(symbol->debug & ZINT_DEBUG_TEST)) { - strcpy(symbol->errtxt, "222: Encoded data includes ECI"); + if (symbol->eci != 0) { + error_number = extended_or_reduced_charset(symbol, local_source, in_length); + if (error_number == 0) { + error_number = ZINT_WARN_USES_ECI; + if (!(symbol->debug & ZINT_DEBUG_TEST)) { + strcpy(symbol->errtxt, "222: Encoded data includes ECI"); + } + if (symbol->debug & ZINT_DEBUG_PRINT) printf("Added ECI %d\n", symbol->eci); } - if (symbol->debug & ZINT_DEBUG_PRINT) printf("Added ECI %d\n", symbol->eci); } } diff --git a/backend/tests/test_common.c b/backend/tests/test_common.c index 9cc14636..b008a957 100644 --- a/backend/tests/test_common.c +++ b/backend/tests/test_common.c @@ -79,6 +79,45 @@ static void test_utf8_to_unicode(int index, int debug) { testFinish(); } +static void test_is_valid_utf8(int index) { + + testStart(""); + + int ret; + struct item { + char* data; + int length; + int ret; + char* comment; + }; + // s/\/\*[ 0-9]*\*\//\=printf("\/*%3d*\/", line(".") - line("'<")) + struct item data[] = { + /* 0*/ { "", -1, 1, "" }, + /* 1*/ { "abcdefghijklmnopqrstuvwxyz", -1, 1, "" }, + /* 2*/ { "éa", -1, 1, "" }, + /* 3*/ { "a\000b", 3, 1, "Embedded nul" }, + /* 4*/ { "\357\273\277a", -1, 1, "Bom" }, + + /* 5*/ { "a\xC2", -1, 0, "Missing 2nd byte" }, + /* 6*/ { "a\200b", -1, 0, "Orphan continuation 0x80" }, + /* 7*/ { "\300\201", -1, 0, "Overlong 0xC081" }, + /* 8*/ { "\355\240\200", -1, 0, "Surrogate 0xEDA080" }, + }; + int data_size = ARRAY_SIZE(data); + + for (int i = 0; i < data_size; i++) { + + if (index != -1 && i != index) continue; + + int length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length; + + ret = is_valid_utf8((const unsigned char *) data[i].data, length); + assert_equal(ret, data[i].ret, "i:%d ret %d != %d\n", i, ret, data[i].ret); + } + + testFinish(); +} + static void test_debug_test_codeword_dump_int(int index, int debug) { testStart(""); @@ -115,6 +154,7 @@ int main(int argc, char *argv[]) { testFunction funcs[] = { /* name, func, has_index, has_generate, has_debug */ { "test_utf8_to_unicode", test_utf8_to_unicode, 1, 0, 1 }, { "test_debug_test_codeword_dump_int", test_debug_test_codeword_dump_int, 1, 0, 1 }, + { "test_is_valid_utf8", test_is_valid_utf8, 1, 0, 0 }, }; testRun(argc, argv, funcs, ARRAY_SIZE(funcs)); diff --git a/backend/tests/test_eci.c b/backend/tests/test_eci.c index 621122b0..e6ed1586 100644 --- a/backend/tests/test_eci.c +++ b/backend/tests/test_eci.c @@ -791,6 +791,42 @@ static void test_utf8_to_eci_ucs2be(void) { } }; +static void test_get_best_eci(int index) { + + testStart(""); + + int ret; + struct item { + const char *data; + int length; + int ret; + }; + // s/\/\*[ 0-9]*\*\//\=printf("\/*%3d*\/", line(".") - line("'<")) + struct item data[] = { + /* 0*/ { "\300\301", -1, 0 }, + /* 1*/ { "ÀÁ", -1, 3 }, + /* 2*/ { "Ђ", -1, 7 }, + /* 3*/ { "Ѐ", -1, 26 }, // Cyrillic U+0400 not in single-byte code pages + /* 4*/ { "β", -1, 9 }, + /* 5*/ { "˜", -1, 23 }, + /* 6*/ { "βЂ", -1, 26 }, + /* 7*/ { "AB\200", -1, 0 }, + }; + int data_size = ARRAY_SIZE(data); + + for (int i = 0; i < data_size; i++) { + + if (index != -1 && i != index) continue; + + int length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length; + + ret = get_best_eci((const unsigned char *) data[i].data, length); + assert_equal(ret, data[i].ret, "i:%d get_best_eci ret %d != %d\n", i, ret, data[i].ret); + } + + testFinish(); +} + int main(int argc, char *argv[]) { testFunction funcs[] = { /* name, func, has_index, has_generate, has_debug */ @@ -800,6 +836,7 @@ int main(int argc, char *argv[]) { { "test_utf8_to_eci_sb", test_utf8_to_eci_sb, 1, 0, 0 }, { "test_utf8_to_eci_ascii", test_utf8_to_eci_ascii, 0, 0, 0 }, { "test_utf8_to_eci_ucs2be", test_utf8_to_eci_ucs2be, 0, 0, 0 }, + { "test_get_best_eci", test_get_best_eci, 1, 0, 0 }, }; testRun(argc, argv, funcs, ARRAY_SIZE(funcs)); diff --git a/backend/tests/test_library.c b/backend/tests/test_library.c index d7885234..cd91df21 100644 --- a/backend/tests/test_library.c +++ b/backend/tests/test_library.c @@ -589,47 +589,6 @@ static void test_strip_bom(void) { testFinish(); } -STATIC_UNLESS_ZINT_TEST int is_valid_utf8(const unsigned char source[], const int length); - -static void test_is_valid_utf8(int index) { - - testStart(""); - - int ret; - struct item { - char* data; - int length; - int ret; - char* comment; - }; - // s/\/\*[ 0-9]*\*\//\=printf("\/*%3d*\/", line(".") - line("'<")) - struct item data[] = { - /* 0*/ { "", -1, 1, "" }, - /* 1*/ { "abcdefghijklmnopqrstuvwxyz", -1, 1, "" }, - /* 2*/ { "éa", -1, 1, "" }, - /* 3*/ { "a\000b", 3, 1, "Embedded nul" }, - /* 4*/ { "\357\273\277a", -1, 1, "Bom" }, - - /* 5*/ { "a\xC2", -1, 0, "Missing 2nd byte" }, - /* 6*/ { "a\200b", -1, 0, "Orphan continuation 0x80" }, - /* 7*/ { "\300\201", -1, 0, "Overlong 0xC081" }, - /* 8*/ { "\355\240\200", -1, 0, "Surrogate 0xEDA080" }, - }; - int data_size = ARRAY_SIZE(data); - - for (int i = 0; i < data_size; i++) { - - if (index != -1 && i != index) continue; - - int length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length; - - ret = is_valid_utf8((const unsigned char *) data[i].data, length); - assert_equal(ret, data[i].ret, "i:%d ret %d != %d\n", i, ret, data[i].ret); - } - - testFinish(); -} - int main(int argc, char *argv[]) { testFunction funcs[] = { /* name, func, has_index, has_generate, has_debug */ @@ -643,7 +602,6 @@ int main(int argc, char *argv[]) { { "test_valid_id", test_valid_id, 0, 0, 0 }, { "test_error_tag", test_error_tag, 1, 0, 0 }, { "test_strip_bom", test_strip_bom, 0, 0, 0 }, - { "test_is_valid_utf8", test_is_valid_utf8, 1, 0, 0 }, }; testRun(argc, argv, funcs, ARRAY_SIZE(funcs));