mirror of
https://github.com/zint/zint
synced 2024-11-16 20:57:25 +13:00
e30e1f9651
QRCODE: use stricter interpretation of ZINT_FULL_MULTIBYTE, excluding certain trailing bytes libzint: fix some confusing error messages introduced by segment stuff general: new escape chars \U, \d and \o backend_qt: fudge rendering of border rectangles due to scaling/translation rounding errors TODO: better fudge GUI: foreground/background colours -> text boxes and icon buttons, add swap button, independently movable picker (NULL parent), preview colour changes, preview Data Window changes, add clear data (del) buttons, add zap button and Factory Reset menu option, various other fixes libzint: remove STATIC_UNLESS_ZINT_TEST, use wrappers CMake: add find package QtSvg, remove QtXml manual: split symbology and general specs and sort, move DAFT to 4-state, UPC/EAN -> EAN/UPC, DataBar -> GS1 DataBar always, expand MAILMARK info, various other fiddlings man page: options or -> |, expand MSI Plessey check digit options README.linux: add packages info license: add SPDX-License-Identifier to touched files
717 lines
21 KiB
PHP
717 lines
21 KiB
PHP
<?php
|
|
/* Generate ECI multibyte tables from unicode.org mapping files */
|
|
/*
|
|
libzint - the open source barcode library
|
|
Copyright (C) 2022 Robin Stuart <rstuart114@gmail.com>
|
|
*/
|
|
/* SPDX-License-Identifier: BSD-3-Clause */
|
|
/*
|
|
* To create "backend/eci_mb.h" (from project root directory):
|
|
*
|
|
* php backend/tools/gen_eci_mb_h.php
|
|
*
|
|
* NOTE: backend/tools/data/GB18030.TXT will have to be downloaded first from the tarball
|
|
* https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2
|
|
* using the version jdk-1.4.2/GB18030.TXT
|
|
*/
|
|
// 'zend.assertions' should be set to 1 in php.ini
|
|
|
|
/* Remainder of the licence notice, appended line by line to every generated header by `out_header()`.
   It ends with the comment terminator for the comment that `out_header()` opens, followed by the SPDX tag.
   NOTE(review): the notice lines carry no leading indentation here - confirm against the generated headers */
$copyright_text = <<<'EOD'

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the project nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
/* SPDX-License-Identifier: BSD-3-Clause */

EOD;

// Name and directory of this script; the name is quoted in generated headers and in error messages
$basename = basename(__FILE__);
$dirname = dirname(__FILE__);

// Command line options: `-d <dir>` overrides the mapping file directory, `-o <dir>` the output directory
$opts = getopt('d:o:');
$data_dirname = isset($opts['d']) ? $opts['d'] : ($dirname . '/data'); // Where to load file from.
$out_dirname = isset($opts['o']) ? $opts['o'] : ($dirname . '/..'); // Where to put output.

// Year used for the (end of the) copyright range in generated headers
$year = 2022;
|
|
|
|
/*
 * Append the opening lines of a generated header file to the `$out` array: the "generated by ... from"
 * banner, the copyright notice and the opening include guard `Z_<NAME>_H`.
 *
 *  $out            Array of output lines, appended to in place
 *  $name           Base name of the header (e.g. 'big5'), upper-cased for the include guard
 *  $descr          Description of the encoding used in the banner
 *  $file           Source mapping file (URL or path) quoted in the banner
 *  $start_year     If non-zero and different from global `$year`, copyright is given as a range
 *  $extra_comment  If not empty, an extra banner line placed after the quoted `$file`
 *
 * Reads globals `$copyright_text`, `$basename` and `$year`.
 */
function out_header(&$out, $name, $descr, $file, $start_year = 0, $extra_comment = '') {
    global $copyright_text, $basename, $year;

    // Banner comment, closed on the last line used
    $out[] = '/* ' . $name . '.h - tables for Unicode to ' . $descr . ',';
    $out[] = ' generated by "backend/tools/' . $basename . '" from';
    if ($extra_comment !== '') {
        $out[] = ' "' . $file . '"';
        $out[] = ' ' . $extra_comment . ' */';
    } else {
        $out[] = ' "' . $file . '" */';
    }

    // Copyright comment, left open here as `$copyright_text` supplies the closing "*/"
    $years = ($start_year && $start_year != $year) ? ($start_year . '-' . $year) : $year;
    $out[] = '/*';
    $out[] = ' libzint - the open source barcode library';
    $out[] = ' Copyright (C) ' . $years . ' Robin Stuart <rstuart114@gmail.com>';
    foreach (explode("\n", $copyright_text) as $copyright_line) {
        $out[] = $copyright_line;
    }

    // Opening include guard; the matching "#endif" is appended by the caller
    $caps = strtoupper($name);
    $out[] = '#ifndef Z_' . $caps . '_H';
    $out[] = '#define Z_' . $caps . '_H';
}
|
|
|
|
/*
 * Output a block of table entries to `$out` array, 8 entries per line.
 *
 *  $out      Array of output lines, appended to in place
 *  $arr      Values to output, read by index `0` to `$cnt - 1`
 *  $cnt      Number of entries of `$arr` to output; nothing is appended if zero
 *  $not_hex  If set, entries are output as width 5 decimals, otherwise as 4-digit (at least) upper-case hex
 */
function out_tab_entries(&$out, $arr, $cnt, $not_hex = false) {
    $indent = ' ';
    $fmt = $not_hex ? ' %5d,' : ' 0x%04X,'; // Every entry is followed by a comma, including the last

    $line = $indent;
    for ($i = 0; $i < $cnt; $i++) {
        if ($i !== 0 && $i % 8 === 0) { // Line full, start a new one
            $out[] = $line;
            $line = $indent;
        }
        $line .= sprintf($fmt, $arr[$i]);
    }
    if ($line !== $indent) { // Last (possibly partial) line
        $out[] = $line;
    }
}
|
|
|
|
/*
 * Output tables to `$out` array: the Unicode codepoints table `<name>_u[]`, the multibyte table
 * `<name>_mb[]` (unless `$mb` is empty) and the block index table `<name>_u_ind[]` (unless `$no_u_ind`).
 *
 *  $out         Array of output lines, appended to in place
 *  $name        Prefix of the C array names
 *  $sort        Unicode codepoints (callers pass them in ascending order)
 *  $mb          Multibyte values
 *  $no_u_ind    If set, no `<name>_u_ind[]` table is output
 *  $u_comment   Comment for the Unicode table, defaults to 'Unicode codepoints sorted'
 *  $mb_comment  Comment for the multibyte table, defaults to 'Multibyte values sorted in Unicode order'
 */
function out_tabs(&$out, $name, $sort, $mb, $no_u_ind = false, $u_comment = '', $mb_comment = '') {
    if ($u_comment == '') {
        $u_comment = 'Unicode codepoints sorted';
    }
    $cnt_sort = count($sort);
    $out[] = '';
    $out[] = '/* ' . $u_comment . ' */';
    $out[] = 'static const unsigned short ' . $name . '_u[' . $cnt_sort . '] = {';
    out_tab_entries($out, $sort, $cnt_sort);
    $out[] = '};';

    if (!empty($mb)) {
        if ($mb_comment == '') {
            $mb_comment = 'Multibyte values sorted in Unicode order';
        }
        $cnt = count($mb);
        $out[] = '';
        $out[] = '/* ' . $mb_comment . ' */';
        $out[] = 'static const unsigned short ' . $name . '_mb[' . $cnt . '] = {';
        out_tab_entries($out, $mb, $cnt);
        $out[] = '};';
    }

    if (!$no_u_ind) {
        // One entry per block of 0x100 codepoints, counted from the first codepoint `$sort[0]`: the index
        // of the first codepoint found at or beyond that block. Collected first so that the array size
        // is known when the declaration is output
        $inds = array();
        $cnt = 0;
        foreach ($sort as $ind => $u) {
            $div = ($u - $sort[0]) >> 8;
            while ($div >= $cnt) { // Also fills in any blocks skipped since the previous codepoint
                $inds[] = $ind;
                $cnt++;
            }
        }
        $out[] = '';
        $out[] = '/* Indexes into Unicode `' . $name . '_u[]` array in blocks of 0x100 */';
        $out[] = 'static const unsigned short ' . $name . '_u_ind[' . $cnt . '] = {';
        out_tab_entries($out, $inds, $cnt, true /*not_hex*/);
        $out[] = '};';
    }
}
|
|
|
|
/*
 * Helper to output special-case URO (Unified Repertoire and Ordering) block (U+4E00-U+9FFF) tables:
 * `<name>_uro_u[]` in hex followed by `<name>_uro_mb_ind[]` in decimal.
 *
 *  $out             Array of output lines, appended to in place
 *  $name            Prefix of the C array names
 *  $tab_uro_u       Usage bit-flags; callers build one entry per 16 codepoints with bit `j` set if
 *                   codepoint `base + j` is mapped
 *  $tab_uro_mb_ind  Multibyte table index for each entry of `$tab_uro_u`
 */
function out_uro_tabs(&$out, $name, $tab_uro_u, $tab_uro_mb_ind) {
    $cnt_u = count($tab_uro_u);
    $out[] = '';
    $out[] = '/* Unicode usage bit-flags for URO (U+4E00-U+9FFF) block */';
    $out[] = 'static const unsigned short ' . $name . '_uro_u[' . $cnt_u . '] = {';
    out_tab_entries($out, $tab_uro_u, $cnt_u);
    $out[] = '};';

    $cnt_mb_ind = count($tab_uro_mb_ind);
    $out[] = '';
    $out[] = '/* Multibyte indexes for URO (U+4E00-U+9FFF) block */';
    $out[] = 'static const unsigned short ' . $name . '_uro_mb_ind[' . $cnt_mb_ind . '] = {';
    out_tab_entries($out, $tab_uro_mb_ind, $cnt_mb_ind, true /*not_hex*/);
    $out[] = '};';
}
|
|
|
|
// BIG5: generate "big5.h" from "BIG5.TXT"

$out = array();

out_header($out, 'big5', 'Big5', 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT', 2021);

$file = $data_dirname . '/' . 'BIG5.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so that callers can detect the failure
}

$lines = explode("\n", $get);

// Parse the file.

$sort = array(); // Unicode codepoints
$mb = array(); // Corresponding Big5 values
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    // 1st column is the Big5 value, 2nd column the Unicode codepoint
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort by Unicode codepoint, keeping `$mb` in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Find index of first codepoint not below the URO block
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // So that nothing is removed below should no URO codepoint be found
$big5_uro_u = $big5_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`/`$mb`
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + $j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i; // Ends up as index of last URO codepoint found
        }
    }
    $big5_uro_u[] = $used;
    $big5_uro_mb_ind[] = $u_i; // Index of first mapped codepoint at or after this group of 16
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'big5', $big5_uro_u, $big5_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left whole, the URO indexes above being positions in it)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Big5 tables

out_tabs($out, 'big5', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_BIG5_H */';

$out_file = $out_dirname . '/big5.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
|
|
|
|
// EUC-KR (KS X 1001): generate "ksx1001.h" from "KSX1001.TXT"

$out = array();

out_header($out, 'ksx1001', 'EUC-KR (KS X 1001)',
    'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT', 2021);

$file = $data_dirname . '/' . 'KSX1001.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so that callers can detect the failure
}

$lines = explode("\n", $get);

// Parse the file.

$sort = array(); // Unicode codepoints
$mb = array(); // Corresponding EUC-KR values
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    // 1st column is the KS X 1001 value, 2nd column the Unicode codepoint
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]) + 0x8080; // Convert to EUC-KR
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Add some characters defined later than in KSX1001.TXT

$sort[] = 0x20AC; // Euro sign added KS X 1001:1998
$mb[] = 0x2266 + 0x8080;

$sort[] = 0xAE; // Registered trademark added KS X 1001:1998
$mb[] = 0x2267 + 0x8080;

$sort[] = 0x327E; // Korean postal code symbol added KS X 1001:2002
$mb[] = 0x2268 + 0x8080;

// Sort by Unicode codepoint, keeping `$mb` in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Find index of first codepoint not below the URO block
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // So that nothing is removed below should no URO codepoint be found
$ksx1001_uro_u = $ksx1001_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`/`$mb`
for ($u = 0x4E00; $u <= 0x9F9F; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + $j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i; // Ends up as index of last URO codepoint found
        }
    }
    $ksx1001_uro_u[] = $used;
    $ksx1001_uro_mb_ind[] = $u_i; // Index of first mapped codepoint at or after this group of 16
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'ksx1001', $ksx1001_uro_u, $ksx1001_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left whole, the URO indexes above being positions in it)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output KS X 1001 tables
out_tabs($out, 'ksx1001', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_KSX1001_H */';

$out_file = $out_dirname . '/ksx1001.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
|
|
|
|
// Shift JIS: generate "sjis.h" from "SHIFTJIS.TXT"

$out = array();

out_header($out, 'sjis', 'Shift JIS', 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT', 2009);

$file = $data_dirname . '/' . 'SHIFTJIS.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so that callers can detect the failure
}

$lines = explode("\n", $get);

// Parse the file.

$sort = array(); // Unicode codepoints
$mb = array(); // Corresponding Shift JIS values
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    // 1st column is the Shift JIS value (1 or 2 bytes), 2nd column the Unicode codepoint
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        // Values below 0x80 are skipped, apart from 0x5C and 0x7E
        if ($d < 0x80 && $d != 0x5C && $d != 0x7E) {
            continue;
        }
        $u = hexdec($matches[2]);
        // PUA characters (user-defined range), dealt with programmatically by `u_sjis()`
        // See CJKV Information Processing by Ken Lunde, 2nd ed., Table 4-86, p.286
        // https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf
        if ($u >= 0xE000 && $u <= 0xE757) {
            continue;
        }
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort by Unicode codepoint, keeping `$mb` in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Find index of first codepoint not below the URO block
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // So that nothing is removed below should no URO codepoint be found
$sjis_uro_u = $sjis_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`/`$mb`
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + $j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i; // Ends up as index of last URO codepoint found
        }
    }
    $sjis_uro_u[] = $used;
    $sjis_uro_mb_ind[] = $u_i; // Index of first mapped codepoint at or after this group of 16
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'sjis', $sjis_uro_u, $sjis_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left whole, the URO indexes above being positions in it)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Shift JIS tables
out_tabs($out, 'sjis', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_SJIS_H */';

$out_file = $out_dirname . '/sjis.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
|
|
|
|
// GB 2312: generate "gb2312.h" from "GB2312.TXT"

$out = array();

out_header($out, 'gb2312', 'GB 2312-1980 (EUC-CN)',
    'unicode.org-mappings/EASTASIA/GB/GB2312.TXT', 2009,
    '(see https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2)');

$file = $data_dirname . '/' . 'GB2312.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so that callers can detect the failure
}

$lines = explode("\n", $get);

// Parse the file.

$sort = array(); // Unicode codepoints
$mb = array(); // Corresponding EUC-CN values
$in_gb2312 = array(); // Codepoints mapped by GB 2312, used when processing GBK later on
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    // 1st column is the GB 2312 value, 2nd column the Unicode codepoint
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d + 0x8080; // Convert to EUC-CN
        $in_gb2312[$u] = true;
    }
}

// Sort by Unicode codepoint, keeping `$mb` in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Find index of first codepoint not below the URO block
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // So that nothing is removed below should no URO codepoint be found
$gb2312_uro_u = $gb2312_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`/`$mb`
for ($u = 0x4E00; $u <= 0x9CEF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + $j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i; // Ends up as index of last URO codepoint found
        }
    }
    $gb2312_uro_u[] = $used;
    $gb2312_uro_mb_ind[] = $u_i; // Index of first mapped codepoint at or after this group of 16
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gb2312', $gb2312_uro_u, $gb2312_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left whole, the URO indexes above being positions in it)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GB 2312 tables
out_tabs($out, 'gb2312', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_GB2312_H */';

$out_file = $out_dirname . '/gb2312.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
|
|
|
|
// GBK: generate "gbk.h" from "CP936.TXT", leaving out what GB 2312 already maps (see `$in_gb2312` above)

$out = array();

out_header($out, 'gbk', 'GBK, excluding mappings in GB 2312',
    'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT');

$file = $data_dirname . '/' . 'CP936.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so that callers can detect the failure
}

$lines = explode("\n", $get);

// Parse the file.

$sort = array(); // Unicode codepoints
$mb = array(); // Corresponding GBK values
$in_gbk = array(); // Codepoints mapped by GBK (including those in GB 2312), used when processing GB 18030
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    // 1st column is the GBK value, 2nd column the Unicode codepoint
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        $in_gbk[$u] = true;
        if ($u != 0x2015 && isset($in_gb2312[$u])) { // U+2015 mapped differently by GBK
            continue;
        }
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort by Unicode codepoint, keeping `$mb` in step
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table

// Find index of first codepoint not below the URO block
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // So that nothing is removed below should no URO codepoint be found
$gbk_uro_u = $gbk_uro_mb_ind = array();
$sort_search = array_flip($sort); // Codepoint => index into `$sort`/`$mb`
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + $j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_search[$u + $j])) {
            $i = $sort_search[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i; // Ends up as index of last URO codepoint found
        }
    }
    $gbk_uro_u[] = $used;
    $gbk_uro_mb_ind[] = $u_i; // Index of first mapped codepoint at or after this group of 16
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gbk', $gbk_uro_u, $gbk_uro_mb_ind);

// Remove URO block from Unicode table (`$mb` is left whole, the URO indexes above being positions in it)
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GBK tables
out_tabs($out, 'gbk', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_GBK_H */';

$out_file = $out_dirname . '/gbk.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
|
|
|
|
// GB 18030: generate "gb18030.h" from "GB18030.TXT", leaving out what GBK already maps (see `$in_gbk` above)

$out = array();

out_header($out, 'gb18030', 'GB 18030-2005', 'jdk-1.4.2/GB18030.TXT', 2016,
    '(see https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2)');

$file = $data_dirname . '/' . 'GB18030.TXT';

// Read the file.

if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so that callers can detect the failure
}

$lines = explode("\n", $get);

// Parse the file.

$sort2 = array(); // Unicode codepoints with multibyte values below 0x10000 (2-byte)
$mb2 = array();
$sort4 = array(); // Unicode codepoints with multibyte values of 0x10000 and above (4-byte)
$mb4 = array();

foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{5})/', $line)) { // Exclude U+10000..10FFFF to save space
        continue;
    }
    $matches = array();
    // 1st column is the GB 18030 value, 2nd column the Unicode codepoint
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{4}).*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        // Note any entries in the file for the 2-byte GB 18030-2005 changes U+9FB4..9FBB and U+FE10..FE19
        // (added explicitly below) are not skipped here
        // 4-byte extension change, PUA
        if ($u == 0xE7C7) {
            continue;
        }
        if ($d < 0x10000) {
            if (isset($in_gbk[$u])) {
                continue;
            }
            // User-defined, dealt with programmatically by `u_gb18030()`
            if ($u >= 0xE000 && $u <= 0xE765) {
                continue;
            }
            $sort2[] = $u;
            $mb2[] = $d;
        } else if ($u < 0x10000) {
            $sort4[] = $u;
            $mb4[] = $d;
        }
    }
}

/* 2-byte extension GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */
$sort2[] = 0x1E3F; $mb2[] = 0xA8BC;

/* 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. */
$sort2[] = 0x9FB4; $mb2[] = 0xFE59;
$sort2[] = 0x9FB5; $mb2[] = 0xFE61;
$sort2[] = 0x9FB6; $mb2[] = 0xFE66;
$sort2[] = 0x9FB7; $mb2[] = 0xFE67;
$sort2[] = 0x9FB8; $mb2[] = 0xFE6D;
$sort2[] = 0x9FB9; $mb2[] = 0xFE7E;
$sort2[] = 0x9FBA; $mb2[] = 0xFE90;
$sort2[] = 0x9FBB; $mb2[] = 0xFEA0;

$sort2[] = 0xFE10; $mb2[] = 0xA6D9;
$sort2[] = 0xFE11; $mb2[] = 0xA6DB;
$sort2[] = 0xFE12; $mb2[] = 0xA6DA;
$sort2[] = 0xFE13; $mb2[] = 0xA6DC;
$sort2[] = 0xFE14; $mb2[] = 0xA6DD;
$sort2[] = 0xFE15; $mb2[] = 0xA6DE;
$sort2[] = 0xFE16; $mb2[] = 0xA6DF;
$sort2[] = 0xFE17; $mb2[] = 0xA6EC;
$sort2[] = 0xFE18; $mb2[] = 0xA6ED;
$sort2[] = 0xFE19; $mb2[] = 0xA6F3;

/* 4-byte extension PUA */
// Dealt with by `u_gb18030()`
//$sort4[] = 0xE7C7;
//$mb4[] = 0x8135F437;

// Calculate Unicode start/end codepoints mapping to consecutive 4-byte blocks

array_multisort($sort4, $mb4);

$gb18030_4_u_b = array(); // First codepoint of each run of consecutive codepoints
$gb18030_4_u_e = array(); // Last codepoint of each run
$gb18030_4_mb_o = array(); // Total number of codepoints skipped before each run

// Start/end points
$prev_u = $begin_u = $sort4[0];
for ($i = 1, $cnt = count($sort4); $i < $cnt; $i++) {
    $u = $sort4[$i];
    if ($u === $prev_u + 1) { // Still in the same run
        $prev_u++;
        continue;
    }
    $gb18030_4_u_b[] = $begin_u;
    $gb18030_4_u_e[] = $prev_u;
    $begin_u = $prev_u = $u;
}
$gb18030_4_u_b[] = $begin_u;
$gb18030_4_u_e[] = $prev_u;

// Gaps between blocks
$gb18030_4_mb_o[] = 0;
for ($i = 1, $cnt = count($gb18030_4_u_b); $i < $cnt; $i++) {
    // Size of gap since end of previous run, added to the running total
    $gb18030_4_mb_o[] = $gb18030_4_u_b[$i] - ($gb18030_4_u_e[$i - 1] + 1) + $gb18030_4_mb_o[$i - 1];
}

// Output GB 18030 tables

array_multisort($sort2, $mb2);
out_tabs($out, 'gb18030_2', $sort2, $mb2, true /*no_ind*/);

// Start codepoints `gb18030_4_u_b` array not needed by `u_gb18030()`
$cnt = count($gb18030_4_u_e);
$out[] = '';
$out[] = '/* End Unicode codepoints of blocks mapping consecutively to 4-byte multibyte blocks */';
$out[] = 'static const unsigned short gb18030_4_u_e[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_u_e, $cnt);
$out[] = '};';

$cnt = count($gb18030_4_mb_o);
$out[] = '';
$out[] = '/* Cumulative gaps between Unicode blocks mapping consecutively to 4-byte multibyte blocks,';
$out[] = ' used to adjust multibyte offsets */';
$out[] = 'static const unsigned short gb18030_4_mb_o[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_mb_o, $cnt, true /*not_hex*/);
$out[] = '};';

$out[] = '';
$out[] = '#endif /* Z_GB18030_H */';

$out_file = $out_dirname . '/gb18030.h';
if (file_put_contents($out_file, implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write file \"$out_file\"");
    echo $error . PHP_EOL;
    exit(1);
}
|
|
|
|
/* vim: set ts=4 sw=4 et : */
|