Implement missing unicodedata functions and fix lookup error type

youknowone · youknowone · commit 334936045dd4 · 2026-02-14T09:32:17.000+09:00
Add combining, decomposition, digit, decimal, numeric methods to Ucd.
Change lookup() to raise KeyError instead of LookupError.
Remove expectedFailure markers from 9 passing tests.
Add unicodedata.is_normalized() method.
Rename decomp_chars to chars to fix spell check.
Remove expectedFailure from test_named_unicode_escapes and
test_urlsplit_normalization.
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
@@ -851,7 +851,6 @@ def test_other_escapes(self):
             with self.subTest(c):
                 self.assertRaises(re.PatternError, re.compile, '[\\%c]' % c)
 
-    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_named_unicode_escapes(self):
         # test individual Unicode named escapes
         self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
@@ -203,7 +203,6 @@ def check_version(testfile):
                 with self.assertRaises(KeyError):
                     unicodedata.ucd_3_2_0.lookup(seqname)
 
-    @unittest.expectedFailure # TODO: RUSTPYTHON
     def test_errors(self):
         self.assertRaises(TypeError, unicodedata.name)
         self.assertRaises(TypeError, unicodedata.name, 'xx')
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
@@ -120,7 +120,6 @@ def test_no_names_in_pua(self):
             char = chr(i)
             self.assertRaises(ValueError, self.db.name, char)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; LookupError: undefined character name 'LATIN SMLL LETR A'
     def test_lookup_nonexistant(self):
         # just make sure that lookup can fail
         for nonexistent in [
@@ -133,7 +132,6 @@ def test_lookup_nonexistant(self):
         ]:
             self.assertRaises(KeyError, self.db.lookup, nonexistent)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit'
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)
@@ -146,7 +144,6 @@ def test_digit(self):
         self.assertRaises(TypeError, self.db.digit, 'xx')
         self.assertRaises(ValueError, self.db.digit, 'x')
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'numeric'
     def test_numeric(self):
         self.assertEqual(self.db.numeric('A',None), None)
         self.assertEqual(self.db.numeric('9'), 9)
@@ -160,7 +157,6 @@ def test_numeric(self):
         self.assertRaises(TypeError, self.db.numeric, 'xx')
         self.assertRaises(ValueError, self.db.numeric, 'x')
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decimal'
     def test_decimal(self):
         self.assertEqual(self.db.decimal('A',None), None)
         self.assertEqual(self.db.decimal('9'), 9)
@@ -193,7 +189,6 @@ def test_bidirectional(self):
         self.assertRaises(TypeError, self.db.bidirectional)
         self.assertRaises(TypeError, self.db.bidirectional, 'xx')
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decomposition'
     def test_decomposition(self):
         self.assertEqual(self.db.decomposition('\uFFFE'),'')
         self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
@@ -210,7 +205,6 @@ def test_mirrored(self):
         self.assertRaises(TypeError, self.db.mirrored)
         self.assertRaises(TypeError, self.db.mirrored, 'xx')
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'combining'
     def test_combining(self):
         self.assertEqual(self.db.combining('\uFFFE'), 0)
         self.assertEqual(self.db.combining('a'), 0)
@@ -313,7 +307,6 @@ def test_failed_import_during_compiling(self):
             "(can't load unicodedata module)"
         self.assertIn(error, result.err.decode("ascii"))
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decimal'
     def test_decimal_numeric_consistent(self):
         # Test that decimal and numeric are consistent,
         # i.e. if a character has a decimal value,
@@ -327,7 +320,6 @@ def test_decimal_numeric_consistent(self):
                 count += 1
         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit'
     def test_digit_numeric_consistent(self):
         # Test that digit and numeric are consistent,
         # i.e. if a character has a digit value,
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -1495,7 +1495,6 @@ def test_all(self):
                 expected.append(name)
         self.assertCountEqual(urllib.parse.__all__, expected)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_urlsplit_normalization(self):
         # Certain characters should never occur in the netloc,
         # including under normalization.
diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs
@@ -43,7 +43,7 @@ mod unicodedata {
     };
     use itertools::Itertools;
     use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
-    use ucd::{Codepoint, EastAsianWidth};
+    use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType};
     use unic_char_property::EnumeratedCharProperty;
     use unic_normal::StrNormalForm;
     use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
@@ -62,9 +62,15 @@ mod unicodedata {
             "lookup",
             "name",
             "bidirectional",
+            "combining",
+            "decimal",
+            "decomposition",
+            "digit",
             "east_asian_width",
-            "normalize",
+            "is_normalized",
             "mirrored",
+            "normalize",
+            "numeric",
         ] {
             module.set_attr(attr, ucd.get_attr(attr, vm)?, vm)?;
         }
@@ -125,7 +131,11 @@ mod unicodedata {
             {
                 return Ok(character.to_string());
             }
-            Err(vm.new_lookup_error(format!("undefined character name '{name}'")))
+            Err(vm.new_key_error(
+                vm.ctx
+                    .new_str(format!("undefined character name '{name}'"))
+                    .into(),
+            ))
         }
 
         #[pymethod]
@@ -189,6 +199,19 @@ mod unicodedata {
             Ok(normalized_text)
         }
 
+        // Reports whether `unistr` is already in the requested normalization
+        // form: the text is normalized into a fresh buffer and compared with
+        // the input, so equality means nothing needed to change.
+        #[pymethod]
+        fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
+            use super::NormalizeForm as Form;
+            let original = unistr.as_wtf8();
+            let renormalized: Wtf8Buf = match form {
+                Form::Nfc => original.map_utf8(|chunk| chunk.nfc()).collect(),
+                Form::Nfkc => original.map_utf8(|chunk| chunk.nfkc()).collect(),
+                Form::Nfd => original.map_utf8(|chunk| chunk.nfd()).collect(),
+                Form::Nfkd => original.map_utf8(|chunk| chunk.nfkd()).collect(),
+            };
+            Ok(original == &*renormalized)
+        }
+
         #[pymethod]
         fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
             match self.extract_char(character, vm)? {
@@ -204,12 +227,123 @@ mod unicodedata {
             }
         }
 
+        // Returns the canonical combining class of `character` as an integer.
+        // Falls back to 0 whenever `extract_char` yields nothing or the code
+        // point cannot be converted to a `char`.
+        #[pymethod]
+        fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
+            let scalar = self.extract_char(character, vm)?.and_then(|c| c.to_char());
+            let class = match scalar {
+                Some(ch) => ch.canonical_combining_class() as i32,
+                None => 0,
+            };
+            Ok(class)
+        }
+
+        // Renders the decomposition mapping of `character` as space-separated,
+        // zero-padded (at least four digits) uppercase hex code points.
+        // Non-canonical mappings are prefixed with a `<tag>` naming the
+        // decomposition type. An empty string is returned when no character
+        // could be extracted or when the mapping is just the character itself.
+        #[pymethod]
+        fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
+            let Some(ch) = self.extract_char(character, vm)?.and_then(|c| c.to_char()) else {
+                return Ok(String::new());
+            };
+            let mapped: Vec<char> = ch.decomposition_map().collect();
+            // A mapping consisting solely of the character itself means "no decomposition".
+            if mapped == [ch] {
+                return Ok(String::new());
+            }
+            let code_points = mapped
+                .iter()
+                .map(|&target| format!("{:04X}", target as u32))
+                .join(" ");
+            Ok(match ch.decomposition_type() {
+                // Canonical (or untyped) mappings carry no tag.
+                None | Some(DecompositionType::Canonical) => code_points,
+                Some(kind) => format!("<{}> {code_points}", decomposition_type_tag(kind)),
+            })
+        }
+
+        // Returns the digit value of `character` as a Python int. Only
+        // characters whose numeric type is Decimal or Digit and whose numeric
+        // value is an integer qualify; otherwise `default` is returned, or a
+        // ValueError("not a digit") is raised when no default was supplied.
+        #[pymethod]
+        fn digit(
+            &self,
+            character: PyStrRef,
+            default: OptionalArg<PyObjectRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult {
+            let digit_value = self
+                .extract_char(character, vm)?
+                .and_then(|c| c.to_char())
+                .filter(|&ch| {
+                    matches!(
+                        ch.numeric_type(),
+                        Some(NumericType::Decimal | NumericType::Digit)
+                    )
+                })
+                .and_then(|ch| match ch.numeric_value() {
+                    Some(Number::Integer(n)) => Some(n),
+                    _ => None,
+                });
+            match digit_value {
+                Some(n) => Ok(vm.ctx.new_int(n).into()),
+                None => default.ok_or_else(|| vm.new_value_error("not a digit")),
+            }
+        }
+
+        // Returns the decimal value of `character` as a Python int. Only
+        // characters whose numeric type is Decimal and whose numeric value is
+        // an integer qualify; otherwise `default` is returned, or a
+        // ValueError("not a decimal") is raised when no default was supplied.
+        #[pymethod]
+        fn decimal(
+            &self,
+            character: PyStrRef,
+            default: OptionalArg<PyObjectRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult {
+            let decimal_value = self
+                .extract_char(character, vm)?
+                .and_then(|c| c.to_char())
+                .filter(|&ch| ch.numeric_type() == Some(NumericType::Decimal))
+                .and_then(|ch| match ch.numeric_value() {
+                    Some(Number::Integer(n)) => Some(n),
+                    _ => None,
+                });
+            match decimal_value {
+                Some(n) => Ok(vm.ctx.new_int(n).into()),
+                None => default.ok_or_else(|| vm.new_value_error("not a decimal")),
+            }
+        }
+
+        // Returns the numeric value of `character` as a Python float. Integer
+        // values are widened to f64 and rational values are divided out. When
+        // the character has no numeric value, `default` is returned, or a
+        // ValueError("not a numeric character") is raised if none was given.
+        #[pymethod]
+        fn numeric(
+            &self,
+            character: PyStrRef,
+            default: OptionalArg<PyObjectRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult {
+            let value = self
+                .extract_char(character, vm)?
+                .and_then(|c| c.to_char())
+                .and_then(|ch| ch.numeric_value())
+                .map(|number| match number {
+                    Number::Integer(n) => n as f64,
+                    Number::Rational(num, den) => num as f64 / den as f64,
+                });
+            match value {
+                Some(v) => Ok(vm.ctx.new_float(v).into()),
+                None => default.ok_or_else(|| vm.new_value_error("not a numeric character")),
+            }
+        }
+
         #[pygetset]
         fn unidata_version(&self) -> String {
             self.unic_version.to_string()
         }
     }
 
+    /// Maps a decomposition type to the tag text that `decomposition` places
+    /// between angle brackets (e.g. `Fraction` becomes `"fraction"`).
+    ///
+    /// Every variant is spelled out so that a newly added variant becomes a
+    /// compile error instead of silently falling into a catch-all arm.
+    fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
+        use DecompositionType as Kind;
+        match dt {
+            Kind::Canonical => "canonical",
+            Kind::Compat => "compat",
+            Kind::Circle => "circle",
+            Kind::Final => "final",
+            Kind::Font => "font",
+            Kind::Fraction => "fraction",
+            Kind::Initial => "initial",
+            Kind::Isolated => "isolated",
+            Kind::Medial => "medial",
+            Kind::Narrow => "narrow",
+            // The only tag that is not all lowercase.
+            Kind::Nobreak => "noBreak",
+            Kind::Small => "small",
+            Kind::Square => "square",
+            Kind::Sub => "sub",
+            Kind::Super => "super",
+            Kind::Vertical => "vertical",
+            Kind::Wide => "wide",
+        }
+    }
+
     trait EastAsianWidthAbbrName {
         fn abbr_name(&self) -> &'static str;
     }