Skip to content

Commit 85e761e

Browse files
Fix decomposition
1 parent c6a142f commit 85e761e

3 files changed

Lines changed: 39 additions & 13 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,12 @@ def test_decomposition(self):
378378
# New in 17.0.0
379379
self.assertEqual(self.db.decomposition('\uA7F1'), '' if self.old else '<super> 0053')
380380

381+
# Hangul characters
382+
self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
383+
self.assertEqual(self.db.decomposition('\uAC01'), '1100 1161 11A8')
384+
self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
385+
self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')
386+
381387
self.assertRaises(TypeError, self.db.decomposition)
382388
self.assertRaises(TypeError, self.db.decomposition, 'xx')
383389

@@ -689,7 +695,7 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
689695
# (e.g. 'make distclean && make') to get the correct checksum.
690696
expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
691697
if quicktest else
692-
'b869af769bd8fe352c04622ab90533dc54df5cf3')
698+
'ebfc9dd281c2226998fd435744dd2e9321899beb')
693699

694700
@requires_resource('network')
695701
def test_all_names(self):
@@ -979,7 +985,7 @@ class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
979985
old = True
980986
expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
981987
if quicktest else
982-
'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')
988+
'74936dffe949d99203a47e6a66565b2fc337bae7')
983989

984990

985991
class UnicodeMiscTest(unittest.TestCase):
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
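Fix :func:`unicodedata.decomposition` for precomposed Hangul syllables (U+AC00..U+D7A3): it now returns their algorithmically derived decomposition into conjoining jamo.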

Modules/unicodedata.c

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
429429
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
430430
}
431431

432+
// For Hangul decomposition
433+
#define SBase 0xAC00
434+
#define LBase 0x1100
435+
#define VBase 0x1161
436+
#define TBase 0x11A7
437+
#define LCount 19
438+
#define VCount 21
439+
#define TCount 28
440+
#define NCount (VCount*TCount)
441+
#define SCount (LCount*NCount)
442+
432443
/*[clinic input]
433444
@permit_long_summary
434445
unicodedata.UCD.decomposition
@@ -460,6 +471,23 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
460471
return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */
461472
}
462473

474+
// Hangul Decomposition.
475+
// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
476+
if (SBase <= code && code < (SBase + SCount)) {
477+
int SIndex = code - SBase;
478+
int L = LBase + SIndex / NCount;
479+
int V = VBase + (SIndex % NCount) / TCount;
480+
int T = TBase + SIndex % TCount;
481+
if (T != TBase) {
482+
PyOS_snprintf(decomp, sizeof(decomp),
483+
"%04X %04X %04X", L, V, T);
484+
} else {
485+
PyOS_snprintf(decomp, sizeof(decomp),
486+
"%04X %04X", L, V);
487+
}
488+
return PyUnicode_FromString(decomp);
489+
}
490+
463491
if (code < 0 || code >= 0x110000)
464492
index = 0;
465493
else {
@@ -522,16 +550,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
522550
(*index)++;
523551
}
524552

525-
#define SBase 0xAC00
526-
#define LBase 0x1100
527-
#define VBase 0x1161
528-
#define TBase 0x11A7
529-
#define LCount 19
530-
#define VCount 21
531-
#define TCount 28
532-
#define NCount (VCount*TCount)
533-
#define SCount (LCount*NCount)
534-
535553
static PyObject*
536554
nfd_nfkd(PyObject *self, PyObject *input, int k)
537555
{
@@ -585,7 +603,8 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
585603
}
586604
output = new_output;
587605
}
588-
/* Hangul Decomposition. */
606+
// Hangul Decomposition.
607+
// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
589608
if (SBase <= code && code < (SBase+SCount)) {
590609
int SIndex = code - SBase;
591610
int L = LBase + SIndex / NCount;

0 commit comments

Comments
 (0)