From 2d962c63218ebf8aed46ac918d9a783b8106f0ce Mon Sep 17 00:00:00 2001
From: gitlost <burmartke@gmail.com>
Date: Sun, 21 Mar 2021 17:35:52 +0000
Subject: [PATCH] get_best_eci: check UTF-8 before returning 26; move
 is_valid_utf8 to common

---
 backend/common.c             | 14 ++++++++++++
 backend/common.h             |  1 +
 backend/eci.c                |  4 ++++
 backend/library.c            | 30 +++++++-------------------
 backend/tests/test_common.c  | 40 ++++++++++++++++++++++++++++++++++
 backend/tests/test_eci.c     | 37 +++++++++++++++++++++++++++++++
 backend/tests/test_library.c | 42 ------------------------------------
 7 files changed, 104 insertions(+), 64 deletions(-)

diff --git a/backend/common.c b/backend/common.c
index 47d48fe6..e0803014 100644
--- a/backend/common.c
+++ b/backend/common.c
@@ -335,6 +335,20 @@ INTERNAL unsigned int decode_utf8(unsigned int *state, unsigned int *codep, cons
     return *state;
 }
 
+/* Is string valid UTF-8? Returns 0 if `decode_utf8()` yields 12 for any byte or ends in non-zero state, else 1 */
+INTERNAL int is_valid_utf8(const unsigned char source[], const int length) {
+    int i;
+    unsigned int codepoint, state = 0; /* `codepoint` only receives decoder output; its value is not read here */
+
+    for (i = 0; i < length; i++) {
+        if (decode_utf8(&state, &codepoint, source[i]) == 12) { /* 12 presumably decoder's reject state - confirm */
+            return 0;
+        }
+    }
+
+    return state == 0; /* Non-zero presumably means input ended mid-sequence (cf. "Missing 2nd byte" test) */
+}
+
 /* Convert UTF-8 to Unicode. If `disallow_4byte` unset, allow all values (UTF-32). If `disallow_4byte` set,
  * only allow codepoints <= U+FFFF (ie four-byte sequences not allowed) (UTF-16, no surrogates) */
 INTERNAL int utf8_to_unicode(struct zint_symbol *symbol, const unsigned char source[], unsigned int vals[],
diff --git a/backend/common.h b/backend/common.h
index b7855312..c6c4b071 100644
--- a/backend/common.h
+++ b/backend/common.h
@@ -112,6 +112,7 @@ extern "C" {
     INTERNAL int is_composite(const int symbology);
     INTERNAL int istwodigits(const unsigned char source[], const int length, const int position);
     INTERNAL unsigned int decode_utf8(unsigned int *state, unsigned int *codep, const unsigned char byte);
+    INTERNAL int is_valid_utf8(const unsigned char source[], const int length);
     INTERNAL int utf8_to_unicode(struct zint_symbol *symbol, const unsigned char source[], unsigned int vals[],
                     int *length, const int disallow_4byte);
     INTERNAL void set_minimum_height(struct zint_symbol *symbol, const int min_height);
diff --git a/backend/eci.c b/backend/eci.c
index 7b084c41..5fd89f63 100644
--- a/backend/eci.c
+++ b/backend/eci.c
@@ -271,5 +271,9 @@ INTERNAL int get_best_eci(const unsigned char source[], int length) {
         eci++;
     } while (eci < 25);
 
+    if (!is_valid_utf8(source, length)) {
+        return 0;
+    }
+
     return 26; // If all of these fail, use Unicode!
 }
diff --git a/backend/library.c b/backend/library.c
index 3fa691f8..34397181 100644
--- a/backend/library.c
+++ b/backend/library.c
@@ -1072,20 +1072,6 @@ static int escape_char_process(struct zint_symbol *symbol, unsigned char *input_
     return error_number;
 }
 
-/* Is string valid UTF-8? */
-STATIC_UNLESS_ZINT_TEST int is_valid_utf8(const unsigned char source[], const int length) {
-    int i;
-    unsigned int codepoint, state = 0;
-
-    for (i = 0; i < length; i++) {
-        if (decode_utf8(&state, &codepoint, source[i]) == 12) {
-            return 0;
-        }
-    }
-
-    return state == 0;
-}
-
 int ZBarcode_Encode(struct zint_symbol *symbol, const unsigned char *source, int in_length) {
     int error_number, warn_number;
 #ifdef _MSC_VER
@@ -1327,15 +1313,15 @@ int ZBarcode_Encode(struct zint_symbol *symbol, const unsigned char *source, int
             && (symbol->input_mode & 0x07) == UNICODE_MODE) {
         /* Try another ECI mode */
         symbol->eci = get_best_eci(local_source, in_length);
-
-        error_number = extended_or_reduced_charset(symbol, local_source, in_length);
-
-        if (error_number == 0) {
-            error_number = ZINT_WARN_USES_ECI;
-            if (!(symbol->debug & ZINT_DEBUG_TEST)) {
-                strcpy(symbol->errtxt, "222: Encoded data includes ECI");
+        if (symbol->eci != 0) {
+            error_number = extended_or_reduced_charset(symbol, local_source, in_length);
+            if (error_number == 0) {
+                error_number = ZINT_WARN_USES_ECI;
+                if (!(symbol->debug & ZINT_DEBUG_TEST)) {
+                    strcpy(symbol->errtxt, "222: Encoded data includes ECI");
+                }
+                if (symbol->debug & ZINT_DEBUG_PRINT) printf("Added ECI %d\n", symbol->eci);
             }
-            if (symbol->debug & ZINT_DEBUG_PRINT) printf("Added ECI %d\n", symbol->eci);
         }
     }
 
diff --git a/backend/tests/test_common.c b/backend/tests/test_common.c
index 9cc14636..b008a957 100644
--- a/backend/tests/test_common.c
+++ b/backend/tests/test_common.c
@@ -79,6 +79,45 @@ static void test_utf8_to_unicode(int index, int debug) {
     testFinish();
 }
 
+static void test_is_valid_utf8(int index) {
+    /* Table-driven test of `is_valid_utf8()`; `index` -1 runs every row, otherwise only row `index` is run */
+    testStart("");
+
+    int ret;
+    struct item {
+        char* data; /* Input bytes. NOTE(review): holds string literals, so could be `const char *` */
+        int length; /* Byte count of `data`, or -1 to use `strlen(data)` */
+        int ret; /* Expected `is_valid_utf8()` return value */
+        char* comment; /* Description of the row; not read by the loop below */
+    };
+    // s/\/\*[ 0-9]*\*\//\=printf("\/*%3d*\/", line(".") - line("'<"))
+    struct item data[] = {
+        /*  0*/ { "", -1, 1, "" },
+        /*  1*/ { "abcdefghijklmnopqrstuvwxyz", -1, 1, "" },
+        /*  2*/ { "éa", -1, 1, "" },
+        /*  3*/ { "a\000b", 3, 1, "Embedded nul" }, // Explicit length as `strlen()` would stop at the nul
+        /*  4*/ { "\357\273\277a", -1, 1, "Bom" },
+
+        /*  5*/ { "a\xC2", -1, 0, "Missing 2nd byte" },
+        /*  6*/ { "a\200b", -1, 0, "Orphan continuation 0x80" },
+        /*  7*/ { "\300\201", -1, 0, "Overlong 0xC081" },
+        /*  8*/ { "\355\240\200", -1, 0, "Surrogate 0xEDA080" },
+    };
+    int data_size = ARRAY_SIZE(data);
+
+    for (int i = 0; i < data_size; i++) {
+
+        if (index != -1 && i != index) continue; /* Skip rows other than the one requested */
+
+        int length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length;
+
+        ret = is_valid_utf8((const unsigned char *) data[i].data, length);
+        assert_equal(ret, data[i].ret, "i:%d ret %d != %d\n", i, ret, data[i].ret);
+    }
+
+    testFinish();
+}
+
 static void test_debug_test_codeword_dump_int(int index, int debug) {
 
     testStart("");
@@ -115,6 +154,7 @@ int main(int argc, char *argv[]) {
     testFunction funcs[] = { /* name, func, has_index, has_generate, has_debug */
         { "test_utf8_to_unicode", test_utf8_to_unicode, 1, 0, 1 },
         { "test_debug_test_codeword_dump_int", test_debug_test_codeword_dump_int, 1, 0, 1 },
+        { "test_is_valid_utf8", test_is_valid_utf8, 1, 0, 0 },
     };
 
     testRun(argc, argv, funcs, ARRAY_SIZE(funcs));
diff --git a/backend/tests/test_eci.c b/backend/tests/test_eci.c
index 621122b0..e6ed1586 100644
--- a/backend/tests/test_eci.c
+++ b/backend/tests/test_eci.c
@@ -791,6 +791,42 @@ static void test_utf8_to_eci_ucs2be(void) {
     }
 };
 
+static void test_get_best_eci(int index) {
+    /* Table-driven test of `get_best_eci()`; `index` -1 runs every row, otherwise only row `index` is run */
+    testStart("");
+
+    int ret;
+    struct item {
+        const char *data; /* Input bytes */
+        int length; /* Byte count of `data`, or -1 to use `strlen(data)` */
+        int ret; /* Expected ECI returned by `get_best_eci()` */
+    };
+    // s/\/\*[ 0-9]*\*\//\=printf("\/*%3d*\/", line(".") - line("'<"))
+    struct item data[] = {
+        /*  0*/ { "\300\301", -1, 0 }, // Not valid UTF-8, so 0 expected rather than the Unicode fallback 26
+        /*  1*/ { "ÀÁ", -1, 3 },
+        /*  2*/ { "Ђ", -1, 7 },
+        /*  3*/ { "Ѐ", -1, 26 }, // Cyrillic U+0400 not in single-byte code pages
+        /*  4*/ { "β", -1, 9 },
+        /*  5*/ { "˜", -1, 23 },
+        /*  6*/ { "βЂ", -1, 26 }, // Presumably no single-byte code page covers both characters - confirm
+        /*  7*/ { "AB\200", -1, 0 }, // Orphan continuation byte 0x80, not valid UTF-8, so 0 expected
+    };
+    int data_size = ARRAY_SIZE(data);
+
+    for (int i = 0; i < data_size; i++) {
+
+        if (index != -1 && i != index) continue; /* Skip rows other than the one requested */
+
+        int length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length;
+
+        ret = get_best_eci((const unsigned char *) data[i].data, length);
+        assert_equal(ret, data[i].ret, "i:%d get_best_eci ret %d != %d\n", i, ret, data[i].ret);
+    }
+
+    testFinish();
+}
+
 int main(int argc, char *argv[]) {
 
     testFunction funcs[] = { /* name, func, has_index, has_generate, has_debug */
@@ -800,6 +836,7 @@ int main(int argc, char *argv[]) {
         { "test_utf8_to_eci_sb", test_utf8_to_eci_sb, 1, 0, 0 },
         { "test_utf8_to_eci_ascii", test_utf8_to_eci_ascii, 0, 0, 0 },
         { "test_utf8_to_eci_ucs2be", test_utf8_to_eci_ucs2be, 0, 0, 0 },
+        { "test_get_best_eci", test_get_best_eci, 1, 0, 0 },
     };
 
     testRun(argc, argv, funcs, ARRAY_SIZE(funcs));
diff --git a/backend/tests/test_library.c b/backend/tests/test_library.c
index d7885234..cd91df21 100644
--- a/backend/tests/test_library.c
+++ b/backend/tests/test_library.c
@@ -589,47 +589,6 @@ static void test_strip_bom(void) {
     testFinish();
 }
 
-STATIC_UNLESS_ZINT_TEST int is_valid_utf8(const unsigned char source[], const int length);
-
-static void test_is_valid_utf8(int index) {
-
-    testStart("");
-
-    int ret;
-    struct item {
-        char* data;
-        int length;
-        int ret;
-        char* comment;
-    };
-    // s/\/\*[ 0-9]*\*\//\=printf("\/*%3d*\/", line(".") - line("'<"))
-    struct item data[] = {
-        /*  0*/ { "", -1, 1, "" },
-        /*  1*/ { "abcdefghijklmnopqrstuvwxyz", -1, 1, "" },
-        /*  2*/ { "éa", -1, 1, "" },
-        /*  3*/ { "a\000b", 3, 1, "Embedded nul" },
-        /*  4*/ { "\357\273\277a", -1, 1, "Bom" },
-
-        /*  5*/ { "a\xC2", -1, 0, "Missing 2nd byte" },
-        /*  6*/ { "a\200b", -1, 0, "Orphan continuation 0x80" },
-        /*  7*/ { "\300\201", -1, 0, "Overlong 0xC081" },
-        /*  8*/ { "\355\240\200", -1, 0, "Surrogate 0xEDA080" },
-    };
-    int data_size = ARRAY_SIZE(data);
-
-    for (int i = 0; i < data_size; i++) {
-
-        if (index != -1 && i != index) continue;
-
-        int length = data[i].length == -1 ? (int) strlen(data[i].data) : data[i].length;
-
-        ret = is_valid_utf8((const unsigned char *) data[i].data, length);
-        assert_equal(ret, data[i].ret, "i:%d ret %d != %d\n", i, ret, data[i].ret);
-    }
-
-    testFinish();
-}
-
 int main(int argc, char *argv[]) {
 
     testFunction funcs[] = { /* name, func, has_index, has_generate, has_debug */
@@ -643,7 +602,6 @@ int main(int argc, char *argv[]) {
         { "test_valid_id", test_valid_id, 0, 0, 0 },
         { "test_error_tag", test_error_tag, 1, 0, 0 },
         { "test_strip_bom", test_strip_bom, 0, 0, 0 },
-        { "test_is_valid_utf8", test_is_valid_utf8, 1, 0, 0 },
     };
 
     testRun(argc, argv, funcs, ARRAY_SIZE(funcs));