zint/backend/tools/gen_eci_mb_h.php
gitlost ab3cf4f395 eci.c: replace libiconv-adapted code with own implementations so
as to fully comply with BSD license (for why Library GPL 2+ not
  compatible see https://opensource.stackexchange.com/a/6701),
  ~3% slower (maybe), +~6K extra in data
  (gb18030.c, gb2312.c & sjis.c removed, mapping .TXT files moved
  to backend/tools/data & extra ones added, 2 new PHP generators)
GUI: CODE39/EXCODE39: show/hide HIBC check digit option in addition
  to enable/disable (less confusing)
CLI: batch: pedantic check for EOF using intChar in newline fgetc()
  loop
test_args.c: don't use WIFEXITED(), WEXITSTATUS() on Windows
manual: lessen some copy/paste verbiage by referring back, other
  small tweaks/typos
2022-06-02 20:32:25 +01:00

705 lines
21 KiB
PHP

<?php
/* Generate ECI multibyte tables from unicode.org mapping files */
/*
libzint - the open source barcode library
Copyright (C) 2022 Robin Stuart <rstuart114@gmail.com>
*/
/*
* To create "backend/eci_mb.h" (from project root directory):
*
* php backend/tools/gen_eci_mb_h.php
*
* NOTE: backend/tools/data/GB18030.TXT will have to be downloaded first from the tarball
* https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2
* using the version jdk-1.4.2/GB18030.TXT
*/
// 'zend.assertions' should be set to 1 in php.ini
// Licence conditions and disclaimer, appended line by line to every generated header by `out_header()`.
// The final `*/` closes the C comment that `out_header()` opens before the copyright line.
$copyright_text = <<<'EOD'
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the project nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
EOD;
// This script's name (used in generated comments and error messages) and directory
$basename = basename(__FILE__);
$dirname = dirname(__FILE__);

// Command line options: `-d <dir>` overrides the data directory, `-o <dir>` the output directory
$opts = getopt('d:o:');

$data_dirname = $dirname . '/data'; // Where to load file from.
if (isset($opts['d'])) {
    $data_dirname = $opts['d'];
}

$out_dirname = $dirname . '/..'; // Where to put output.
if (isset($opts['o'])) {
    $out_dirname = $opts['o'];
}

// Current (end) copyright year for generated headers
$year = 2022;
/*
 * Append the opening of a generated header to `$out` array: title comment, libzint copyright notice,
 * licence text and the start of the include guard
 *
 * `$name` base name of the header (without ".h"), also upper-cased for the include guard
 * `$descr` description of the encoding, used in the title comment
 * `$file` mapping file (URL or path) named in the title comment
 * `$start_year` first copyright year; a "start-end" range is output only if non-zero and different from `$year`
 * `$extra_comment` optional extra line appended to the title comment
 *
 * Reads the globals `$copyright_text`, `$basename` and `$year`
 */
function out_header(&$out, $name, $descr, $file, $start_year = 0, $extra_comment = '') {
    global $copyright_text, $basename, $year;

    // Title comment, closed on the extra comment line if there is one
    $out[] = '/* ' . $name . '.h - tables for Unicode to ' . $descr . ',';
    $out[] = ' ' . 'generated by "backend/tools/' . $basename . '" from';
    if ($extra_comment === '') {
        $out[] = ' "' . $file . '" */';
    } else {
        $out[] = ' "' . $file . '"';
        $out[] = ' ' . $extra_comment . ' */';
    }

    // Copyright notice, with the comment continued by the lines of `$copyright_text`
    $years = ($start_year && $start_year != $year) ? $start_year . '-' . $year : $year;
    $out[] = '/*';
    $out[] = ' libzint - the open source barcode library';
    $out[] = ' Copyright (C) ' . $years . ' Robin Stuart <rstuart114@gmail.com>';
    $out = array_merge($out, explode("\n", $copyright_text));

    // Start of include guard
    $guard = 'Z_' . strtoupper($name) . '_H';
    $out[] = '#ifndef ' . $guard;
    $out[] = '#define ' . $guard;
}
/*
 * Output a block of table entries to `$out` array, 8 entries per line
 *
 * The first `$cnt` elements of `$arr` are formatted as upper-case hex zero-padded to at least 4 digits,
 * or as decimals right-aligned in a width of 5 if `$not_hex` is set. Nothing is added if `$cnt` is zero
 */
function out_tab_entries(&$out, $arr, $cnt, $not_hex = false) {
    $fmt = $not_hex ? ' %5d,' : ' 0x%04X,';
    $row = ' ';
    $done = 0;
    while ($done < $cnt) {
        $row .= sprintf($fmt, $arr[$done]);
        $done++;
        // Row is full and more entries follow - start a new one
        if ($done % 8 === 0 && $done < $cnt) {
            $out[] = $row;
            $row = ' ';
        }
    }
    // Flush the last (possibly partial) row
    if ($row !== ' ') {
        $out[] = $row;
    }
}
/*
 * Output tables to `$out` array
 *
 * Outputs, in order:
 *   `<name>_u[]` - the codepoints in `$sort`
 *   `<name>_mb[]` - the values in `$mb` (skipped if `$mb` is empty)
 *   `<name>_u_ind[]` - for each block of 0x100 codepoints, counted from the first codepoint in `$sort`, the
 *      index into `<name>_u[]` of the first codepoint at or beyond that block (skipped if `$no_u_ind` is set)
 *
 * `$u_comment` and `$mb_comment` replace the default comments placed above the first two tables.
 * NOTE: the index calculation relies on `$sort` being in ascending order
 */
function out_tabs(&$out, $name, $sort, $mb, $no_u_ind = false, $u_comment = '', $mb_comment = '') {
    if ($u_comment == '') $u_comment = 'Unicode codepoints sorted';
    $cnt_sort = count($sort);
    $out[] = '';
    $out[] = '/* ' . $u_comment . ' */';
    $out[] = 'static const unsigned short ' . $name . '_u[' . $cnt_sort . '] = {';
    out_tab_entries($out, $sort, $cnt_sort);
    $out[] = '};';

    if (!empty($mb)) {
        if ($mb_comment == '') $mb_comment = 'Multibyte values sorted in Unicode order';
        $cnt = count($mb);
        $out[] = '';
        $out[] = '/* ' . $mb_comment . ' */';
        $out[] = 'static const unsigned short ' . $name . '_mb[' . $cnt . '] = {';
        out_tab_entries($out, $mb, $cnt);
        $out[] = '};';
    }

    if (!$no_u_ind) {
        $out[] = '';
        $out[] = '/* Indexes into Unicode `' . $name . '_u[]` array in blocks of 0x100 */';
        // Remember where the declaration goes - it's rewritten below once the number of entries is known
        $ind_idx = count($out);
        $out[] = 'static const unsigned short ' . $name . '_u_ind[] = {';
        $line = ' ';
        $i = 0; // Number of index entries output so far, i.e. the next block to output
        foreach ($sort as $ind => $u) {
            $div = ($u - $sort[0]) >> 8; // Block that this codepoint falls in
            // Output this index for every block not yet output up to and including the codepoint's own
            while ($div >= $i) {
                if ($i && $i % 8 === 0) {
                    $out[] = $line;
                    $line = ' ';
                }
                $line .= sprintf(' %5d,', $ind);
                $i++;
            }
        }
        if ($line !== ' ') {
            $out[] = $line;
        }
        $out[] = '};';
        $out[$ind_idx] = 'static const unsigned short ' . $name . '_u_ind[' . $i . '] = {';
    }
}
/*
 * Helper to output special-case URO (Unified Repertoire and Ordering) block (U+4E00-U+9FFF) tables
 *
 * Outputs `<name>_uro_u[]` from `$tab_uro_u` in hex, followed by `<name>_uro_mb_ind[]` from
 * `$tab_uro_mb_ind` in decimal, each preceded by a blank line and a comment
 */
function out_uro_tabs(&$out, $name, $tab_uro_u, $tab_uro_mb_ind) {
    $tabs = array(
        // Comment, array name suffix, entries, output as decimal (not hex)
        array('/* Unicode usage bit-flags for URO (U+4E00-U+9FFF) block */', '_uro_u', $tab_uro_u, false),
        array('/* Multibyte indexes for URO (U+4E00-U+9FFF) block */', '_uro_mb_ind', $tab_uro_mb_ind, true),
    );
    foreach ($tabs as $tab) {
        list($comment, $suffix, $entries, $not_hex) = $tab;
        $n = count($entries);
        $out[] = '';
        $out[] = $comment;
        $out[] = 'static const unsigned short ' . $name . $suffix . '[' . $n . '] = {';
        out_tab_entries($out, $entries, $n, $not_hex);
        $out[] = '};';
    }
}
// BIG5
// Generates "big5.h" in the output directory from "BIG5.TXT" in the data directory
$out = array();
out_header($out, 'big5', 'Big5', 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT', 2021);

$file = $data_dirname . '/' . 'BIG5.TXT';

// Read the file.
if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
}

$lines = explode("\n", $get);

// Parse the file.
$sort = array(); // Unicode codepoints
$mb = array(); // Multibyte values, parallel to `$sort`
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort both into Unicode order
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing is removed below unless a URO codepoint is found
// Codepoint => first index in `$sort`; reversed before flipping so that the earliest index wins, the same
// result as `array_search()` but without a linear scan per codepoint
$sort_ind = array_flip(array_reverse($sort, true));
$big5_uro_u = $big5_uro_mb_ind = array();
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_ind[$u + $j])) {
            $i = $sort_ind[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $big5_uro_u[] = $used;
    $big5_uro_mb_ind[] = $u_i; // Index of the first entry for this group of 16 codepoints
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'big5', $big5_uro_u, $big5_uro_mb_ind);

// Remove URO block from Unicode table
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Big5 tables
out_tabs($out, 'big5', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_BIG5_H */';

if (file_put_contents($out_dirname . '/big5.h', implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_dirname/big5.h\"");
    echo $error . PHP_EOL;
    exit(1);
}
// EUC-KR (KS X 1001)
// Generates "ksx1001.h" in the output directory from "KSX1001.TXT" in the data directory
$out = array();
out_header($out, 'ksx1001', 'EUC-KR (KS X 1001)',
    'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT', 2021);

$file = $data_dirname . '/' . 'KSX1001.TXT';

// Read the file.
if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
}

$lines = explode("\n", $get);

// Parse the file.
$sort = array(); // Unicode codepoints
$mb = array(); // Multibyte values, parallel to `$sort`
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]) + 0x8080; // Convert to EUC-KR
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Add some characters defined later than in KSX1001.TXT
$sort[] = 0x20AC; // Euro sign added KS X 1001:1998
$mb[] = 0x2266 + 0x8080;
$sort[] = 0xAE; // Registered trademark added KS X 1001:1998
$mb[] = 0x2267 + 0x8080;
$sort[] = 0x327E; // Korean postal code symbol added KS X 1001:2002
$mb[] = 0x2268 + 0x8080;

// Sort both into Unicode order
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing is removed below unless a URO codepoint is found
// Codepoint => first index in `$sort`; reversed before flipping so that the earliest index wins, the same
// result as `array_search()` but without a linear scan per codepoint
$sort_ind = array_flip(array_reverse($sort, true));
$ksx1001_uro_u = $ksx1001_uro_mb_ind = array();
for ($u = 0x4E00; $u <= 0x9F9F; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_ind[$u + $j])) {
            $i = $sort_ind[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $ksx1001_uro_u[] = $used;
    $ksx1001_uro_mb_ind[] = $u_i; // Index of the first entry for this group of 16 codepoints
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'ksx1001', $ksx1001_uro_u, $ksx1001_uro_mb_ind);

// Remove URO block from Unicode table
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output KS X 1001 tables
out_tabs($out, 'ksx1001', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_KSX1001_H */';

if (file_put_contents($out_dirname . '/ksx1001.h', implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_dirname/ksx1001.h\"");
    echo $error . PHP_EOL;
    exit(1);
}
// Shift JIS
// Generates "sjis.h" in the output directory from "SHIFTJIS.TXT" in the data directory
$out = array();
out_header($out, 'sjis', 'Shift JIS', 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT', 2009);

$file = $data_dirname . '/' . 'SHIFTJIS.TXT';

// Read the file.
if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
}

$lines = explode("\n", $get);

// Parse the file.
$sort = array(); // Unicode codepoints
$mb = array(); // Multibyte values, parallel to `$sort`
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        // Skip values below 0x80 apart from 0x5C and 0x7E
        if ($d < 0x80 && $d != 0x5C && $d != 0x7E) {
            continue;
        }
        $u = hexdec($matches[2]);
        // PUA characters (user-defined range), dealt with programmatically by `u_sjis()`
        // See CJKV Information Processing by Ken Lunde, 2nd ed., Table 4-86, p.286
        // https://file.allitebooks.com/20160708/CJKV%20Information%20Processing.pdf
        if ($u >= 0xE000 && $u <= 0xE757) {
            continue;
        }
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort both into Unicode order
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing is removed below unless a URO codepoint is found
// Codepoint => first index in `$sort`; reversed before flipping so that the earliest index wins, the same
// result as `array_search()` but without a linear scan per codepoint
$sort_ind = array_flip(array_reverse($sort, true));
$sjis_uro_u = $sjis_uro_mb_ind = array();
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_ind[$u + $j])) {
            $i = $sort_ind[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $sjis_uro_u[] = $used;
    $sjis_uro_mb_ind[] = $u_i; // Index of the first entry for this group of 16 codepoints
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'sjis', $sjis_uro_u, $sjis_uro_mb_ind);

// Remove URO block from Unicode table
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output Shift JIS tables
out_tabs($out, 'sjis', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_SJIS_H */';

if (file_put_contents($out_dirname . '/sjis.h', implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_dirname/sjis.h\"");
    echo $error . PHP_EOL;
    exit(1);
}
// GB 2312
// Generates "gb2312.h" in the output directory from "GB2312.TXT" in the data directory
$out = array();
out_header($out, 'gb2312', 'GB 2312-1980 (EUC-CN)',
    'unicode.org-mappings/EASTASIA/GB/GB2312.TXT', 2009,
    '(see https://haible.de/bruno/charsets/conversion-tables/GB2312.tar.bz2)');

$file = $data_dirname . '/' . 'GB2312.TXT';

// Read the file.
if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
}

$lines = explode("\n", $get);

// Parse the file.
$sort = array(); // Unicode codepoints
$mb = array(); // Multibyte values, parallel to `$sort`
$in_gb2312 = array(); // Codepoints mapped by GB 2312, used below to exclude them from the GBK tables
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        $sort[] = $u;
        $mb[] = $d + 0x8080; // Convert to EUC-CN
        $in_gb2312[$u] = true;
    }
}

// Sort both into Unicode order
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing is removed below unless a URO codepoint is found
// Codepoint => first index in `$sort`; reversed before flipping so that the earliest index wins, the same
// result as `array_search()` but without a linear scan per codepoint
$sort_ind = array_flip(array_reverse($sort, true));
$gb2312_uro_u = $gb2312_uro_mb_ind = array();
for ($u = 0x4E00; $u <= 0x9CEF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_ind[$u + $j])) {
            $i = $sort_ind[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $gb2312_uro_u[] = $used;
    $gb2312_uro_mb_ind[] = $u_i; // Index of the first entry for this group of 16 codepoints
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gb2312', $gb2312_uro_u, $gb2312_uro_mb_ind);

// Remove URO block from Unicode table
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GB 2312 tables
out_tabs($out, 'gb2312', $sort, $mb);

$out[] = '';
$out[] = '#endif /* Z_GB2312_H */';

if (file_put_contents($out_dirname . '/gb2312.h', implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_dirname/gb2312.h\"");
    echo $error . PHP_EOL;
    exit(1);
}
// GBK
// Generates "gbk.h" in the output directory from "CP936.TXT" in the data directory, leaving out codepoints
// already recorded in `$in_gb2312` by the GB 2312 section above (except U+2015)
$out = array();
out_header($out, 'gbk', 'GBK, excluding mappings in GB 2312',
    'https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT');

$file = $data_dirname . '/' . 'CP936.TXT';

// Read the file.
if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
}

$lines = explode("\n", $get);

// Parse the file.
$sort = array(); // Unicode codepoints
$mb = array(); // Multibyte values, parallel to `$sort`
$in_gbk = array(); // Codepoints mapped by GBK, used below to exclude them from the GB 18030 tables
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,4})[ \t]+0x([0-9A-F]{4})[ \t].*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        $in_gbk[$u] = true; // Recorded even if excluded from the GBK tables as already in GB 2312
        if ($u != 0x2015 && isset($in_gb2312[$u])) { // U+2015 mapped differently by GBK
            continue;
        }
        $sort[] = $u;
        $mb[] = $d;
    }
}

// Sort both into Unicode order
array_multisort($sort, $mb);

// Calculate URO (U+4E00-U+9FFF) table
for ($u_i = 0, $cnt = count($sort); $u_i < $cnt && $sort[$u_i] < 0x4E00; $u_i++);

$start_u_i = $u_i;
$end_u_i = $start_u_i - 1; // Nothing is removed below unless a URO codepoint is found
// Codepoint => first index in `$sort`; reversed before flipping so that the earliest index wins, the same
// result as `array_search()` but without a linear scan per codepoint
$sort_ind = array_flip(array_reverse($sort, true));
$gbk_uro_u = $gbk_uro_mb_ind = array();
for ($u = 0x4E00; $u <= 0x9FAF; $u += 16) {
    $used = 0; // Bit `j` set if codepoint `$u + j` is mapped
    $next_u_i = $u_i;
    for ($j = 0; $j < 16; $j++) {
        if (isset($sort_ind[$u + $j])) {
            $i = $sort_ind[$u + $j];
            $used |= 1 << $j;
            $next_u_i = $i + 1;
            $end_u_i = $i;
        }
    }
    $gbk_uro_u[] = $used;
    $gbk_uro_mb_ind[] = $u_i; // Index of the first entry for this group of 16 codepoints
    $u_i = $next_u_i;
}

// Output URO tables
out_uro_tabs($out, 'gbk', $gbk_uro_u, $gbk_uro_mb_ind);

// Remove URO block from Unicode table
array_splice($sort, $start_u_i, $end_u_i - $start_u_i + 1);

// Output GBK tables
out_tabs($out, 'gbk', $sort, $mb, true /*no_ind*/);

$out[] = '';
$out[] = '#endif /* Z_GBK_H */';

if (file_put_contents($out_dirname . '/gbk.h', implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_dirname/gbk.h\"");
    echo $error . PHP_EOL;
    exit(1);
}
// GB 18030
// Generates "gb18030.h" in the output directory from "GB18030.TXT" in the data directory, leaving out 2-byte
// mappings of codepoints already recorded in `$in_gbk` by the GBK section above
$out = array();
out_header($out, 'gb18030', 'GB 18030-2005', 'jdk-1.4.2/GB18030.TXT', 2016,
    '(see https://haible.de/bruno/charsets/conversion-tables/GB18030.tar.bz2)');

$file = $data_dirname . '/' . 'GB18030.TXT';

// Read the file.
if (($get = file_get_contents($file)) === false) {
    error_log($error = "$basename: ERROR: Could not read mapping file \"$file\"");
    echo $error . PHP_EOL;
    exit(1); // Non-zero status so callers see the failure (`exit()` with a string exits with status 0)
}

$lines = explode("\n", $get);

// Parse the file.
$sort2 = array(); // Unicode codepoints with 2-byte mappings
$mb2 = array(); // 2-byte multibyte values, parallel to `$sort2`
$sort4 = array(); // Unicode codepoints with 4-byte mappings
$mb4 = array(); // 4-byte multibyte values, parallel to `$sort4`
foreach ($lines as $line) {
    $line = trim($line);
    if ($line === '' || strncmp($line, '0x', 2) !== 0 || strpos($line, "*** NO MAPPING ***") !== false) {
        continue;
    }
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{5})/', $line)) { // Exclude U+10000..10FFFF to save space
        continue;
    }
    $matches = array();
    if (preg_match('/^0x([0-9A-F]{2,8})[ \t]+0x([0-9A-F]{4}).*$/', $line, $matches)) {
        $d = hexdec($matches[1]);
        if ($d < 0x80) {
            continue;
        }
        $u = hexdec($matches[2]);
        // Note entries for the 2-byte extension GB 18030-2005 change (U+9FB4..9FBB and U+FE10..FE19, were PUA,
        // see Table 3-37, p.109, Lunde 2nd ed.) are not skipped here
        // 4-byte extension change, PUA
        if ($u == 0xE7C7) {
            continue;
        }
        if ($d < 0x10000) {
            if (isset($in_gbk[$u])) {
                continue;
            }
            // User-defined, dealt with programmatically by `u_gb18030()`
            if ($u >= 0xE000 && $u <= 0xE765) {
                continue;
            }
            $sort2[] = $u;
            $mb2[] = $d;
        } else if ($u < 0x10000) {
            $sort4[] = $u;
            $mb4[] = $d;
        }
    }
}

/* 2-byte extension GB 18030-2005 change, was PUA U+E7C7 below, see Table 3-39, p.111, Lunde 2nd ed. */
$sort2[] = 0x1E3F; $mb2[] = 0xA8BC;
/* 2-byte extension GB 18030-2005 change, were PUA, see Table 3-37, p.109, Lunde 2nd ed. */
$sort2[] = 0x9FB4; $mb2[] = 0xFE59;
$sort2[] = 0x9FB5; $mb2[] = 0xFE61;
$sort2[] = 0x9FB6; $mb2[] = 0xFE66;
$sort2[] = 0x9FB7; $mb2[] = 0xFE67;
$sort2[] = 0x9FB8; $mb2[] = 0xFE6D;
$sort2[] = 0x9FB9; $mb2[] = 0xFE7E;
$sort2[] = 0x9FBA; $mb2[] = 0xFE90;
$sort2[] = 0x9FBB; $mb2[] = 0xFEA0;
$sort2[] = 0xFE10; $mb2[] = 0xA6D9;
$sort2[] = 0xFE11; $mb2[] = 0xA6DB;
$sort2[] = 0xFE12; $mb2[] = 0xA6DA;
$sort2[] = 0xFE13; $mb2[] = 0xA6DC;
$sort2[] = 0xFE14; $mb2[] = 0xA6DD;
$sort2[] = 0xFE15; $mb2[] = 0xA6DE;
$sort2[] = 0xFE16; $mb2[] = 0xA6DF;
$sort2[] = 0xFE17; $mb2[] = 0xA6EC;
$sort2[] = 0xFE18; $mb2[] = 0xA6ED;
$sort2[] = 0xFE19; $mb2[] = 0xA6F3;
/* 4-byte extension PUA */
// Dealt with by `u_gb18030()`
//$sort4[] = 0xE7C7;
//$mb4[] = 0x8135F437;

// Calculate Unicode start/end codepoints mapping to consecutive 4-byte blocks
array_multisort($sort4, $mb4);
$gb18030_4_u_b = array(); // First codepoint of each run of consecutive codepoints
$gb18030_4_u_e = array(); // Last codepoint of each run
$gb18030_4_mb_o = array(); // Cumulative count of codepoints skipped before each run

// Start/end points
$prev_u = $begin_u = $sort4[0];
for ($i = 1, $cnt = count($sort4); $i < $cnt; $i++) {
    $u = $sort4[$i];
    if ($u === $prev_u + 1) { // Still in the same run
        $prev_u++;
        continue;
    }
    $gb18030_4_u_b[] = $begin_u;
    $gb18030_4_u_e[] = $prev_u;
    $begin_u = $prev_u = $u;
}
$gb18030_4_u_b[] = $begin_u;
$gb18030_4_u_e[] = $prev_u;

// Gaps between blocks
$gb18030_4_mb_o[] = 0;
for ($i = 1, $cnt = count($gb18030_4_u_b); $i < $cnt; $i++) {
    $gb18030_4_mb_o[] = $gb18030_4_u_b[$i] - ($gb18030_4_u_e[$i - 1] + 1) + $gb18030_4_mb_o[count($gb18030_4_mb_o) - 1];
}

// Output GB 18030 tables
array_multisort($sort2, $mb2);
out_tabs($out, 'gb18030_2', $sort2, $mb2, true /*no_ind*/);

// Start codepoints `gb18030_4_u_b` array not needed by `u_gb18030()`
$cnt = count($gb18030_4_u_e);
$out[] = '';
$out[] = '/* End Unicode codepoints of blocks mapping consecutively to 4-byte multibyte blocks */';
$out[] = 'static const unsigned short gb18030_4_u_e[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_u_e, $cnt);
$out[] = '};';

$cnt = count($gb18030_4_mb_o);
$out[] = '';
$out[] = '/* Cumulative gaps between Unicode blocks mapping consecutively to 4-byte multibyte blocks,';
$out[] = ' used to adjust multibyte offsets */';
$out[] = 'static const unsigned short gb18030_4_mb_o[' . $cnt .'] = {';
out_tab_entries($out, $gb18030_4_mb_o, $cnt, true /*not_hex*/);
$out[] = '};';

$out[] = '';
$out[] = '#endif /* Z_GB18030_H */';

if (file_put_contents($out_dirname . '/gb18030.h', implode("\n", $out) . "\n") === false) {
    error_log($error = "$basename: ERROR: Could not write output file \"$out_dirname/gb18030.h\"");
    echo $error . PHP_EOL;
    exit(1);
}
/* vim: set ts=4 sw=4 et : */