@@ -43,7 +43,7 @@ mod unicodedata {
4343 } ;
4444 use itertools:: Itertools ;
4545 use rustpython_common:: wtf8:: { CodePoint , Wtf8Buf } ;
46- use ucd:: { Codepoint , EastAsianWidth } ;
46+ use ucd:: { Codepoint , DecompositionType , EastAsianWidth , Number , NumericType } ;
4747 use unic_char_property:: EnumeratedCharProperty ;
4848 use unic_normal:: StrNormalForm ;
4949 use unic_ucd_age:: { Age , UNICODE_VERSION , UnicodeVersion } ;
@@ -62,9 +62,15 @@ mod unicodedata {
6262 "lookup" ,
6363 "name" ,
6464 "bidirectional" ,
65+ "combining" ,
66+ "decimal" ,
67+ "decomposition" ,
68+ "digit" ,
6569 "east_asian_width" ,
66- "normalize " ,
70+ "is_normalized " ,
6771 "mirrored" ,
72+ "normalize" ,
73+ "numeric" ,
6874 ] {
6975 module. set_attr ( attr, ucd. get_attr ( attr, vm) ?, vm) ?;
7076 }
@@ -125,7 +131,11 @@ mod unicodedata {
125131 {
126132 return Ok ( character. to_string ( ) ) ;
127133 }
128- Err ( vm. new_lookup_error ( format ! ( "undefined character name '{name}'" ) ) )
134+ Err ( vm. new_key_error (
135+ vm. ctx
136+ . new_str ( format ! ( "undefined character name '{name}'" ) )
137+ . into ( ) ,
138+ ) )
129139 }
130140
131141 #[ pymethod]
@@ -189,6 +199,19 @@ mod unicodedata {
189199 Ok ( normalized_text)
190200 }
191201
202+ #[ pymethod]
203+ fn is_normalized ( & self , form : super :: NormalizeForm , unistr : PyStrRef ) -> PyResult < bool > {
204+ use super :: NormalizeForm :: * ;
205+ let text = unistr. as_wtf8 ( ) ;
206+ let normalized: Wtf8Buf = match form {
207+ Nfc => text. map_utf8 ( |s| s. nfc ( ) ) . collect ( ) ,
208+ Nfkc => text. map_utf8 ( |s| s. nfkc ( ) ) . collect ( ) ,
209+ Nfd => text. map_utf8 ( |s| s. nfd ( ) ) . collect ( ) ,
210+ Nfkd => text. map_utf8 ( |s| s. nfkd ( ) ) . collect ( ) ,
211+ } ;
212+ Ok ( text == & * normalized)
213+ }
214+
192215 #[ pymethod]
193216 fn mirrored ( & self , character : PyStrRef , vm : & VirtualMachine ) -> PyResult < i32 > {
194217 match self . extract_char ( character, vm) ? {
@@ -204,12 +227,123 @@ mod unicodedata {
204227 }
205228 }
206229
230+ #[ pymethod]
231+ fn combining ( & self , character : PyStrRef , vm : & VirtualMachine ) -> PyResult < i32 > {
232+ Ok ( self
233+ . extract_char ( character, vm) ?
234+ . and_then ( |c| c. to_char ( ) )
235+ . map_or ( 0 , |ch| ch. canonical_combining_class ( ) as i32 ) )
236+ }
237+
238+ #[ pymethod]
239+ fn decomposition ( & self , character : PyStrRef , vm : & VirtualMachine ) -> PyResult < String > {
240+ let ch = match self . extract_char ( character, vm) ?. and_then ( |c| c. to_char ( ) ) {
241+ Some ( ch) => ch,
242+ None => return Ok ( String :: new ( ) ) ,
243+ } ;
244+ let chars: Vec < char > = ch. decomposition_map ( ) . collect ( ) ;
245+ // If decomposition maps to just the character itself, there's no decomposition
246+ if chars. len ( ) == 1 && chars[ 0 ] == ch {
247+ return Ok ( String :: new ( ) ) ;
248+ }
249+ let hex_parts = chars
250+ . iter ( )
251+ . map ( |c| format ! ( "{:04X}" , * c as u32 ) )
252+ . join ( " " ) ;
253+ let tag = match ch. decomposition_type ( ) {
254+ Some ( DecompositionType :: Canonical ) | None => return Ok ( hex_parts) ,
255+ Some ( dt) => decomposition_type_tag ( dt) ,
256+ } ;
257+ Ok ( format ! ( "<{tag}> {hex_parts}" ) )
258+ }
259+
260+ #[ pymethod]
261+ fn digit (
262+ & self ,
263+ character : PyStrRef ,
264+ default : OptionalArg < PyObjectRef > ,
265+ vm : & VirtualMachine ,
266+ ) -> PyResult {
267+ let ch = self . extract_char ( character, vm) ?. and_then ( |c| c. to_char ( ) ) ;
268+ if let Some ( ch) = ch
269+ && matches ! (
270+ ch. numeric_type( ) ,
271+ Some ( NumericType :: Decimal ) | Some ( NumericType :: Digit )
272+ )
273+ && let Some ( Number :: Integer ( n) ) = ch. numeric_value ( )
274+ {
275+ return Ok ( vm. ctx . new_int ( n) . into ( ) ) ;
276+ }
277+ default. ok_or_else ( || vm. new_value_error ( "not a digit" ) )
278+ }
279+
280+ #[ pymethod]
281+ fn decimal (
282+ & self ,
283+ character : PyStrRef ,
284+ default : OptionalArg < PyObjectRef > ,
285+ vm : & VirtualMachine ,
286+ ) -> PyResult {
287+ let ch = self . extract_char ( character, vm) ?. and_then ( |c| c. to_char ( ) ) ;
288+ if let Some ( ch) = ch
289+ && ch. numeric_type ( ) == Some ( NumericType :: Decimal )
290+ && let Some ( Number :: Integer ( n) ) = ch. numeric_value ( )
291+ {
292+ return Ok ( vm. ctx . new_int ( n) . into ( ) ) ;
293+ }
294+ default. ok_or_else ( || vm. new_value_error ( "not a decimal" ) )
295+ }
296+
297+ #[ pymethod]
298+ fn numeric (
299+ & self ,
300+ character : PyStrRef ,
301+ default : OptionalArg < PyObjectRef > ,
302+ vm : & VirtualMachine ,
303+ ) -> PyResult {
304+ let ch = self . extract_char ( character, vm) ?. and_then ( |c| c. to_char ( ) ) ;
305+ if let Some ( ch) = ch {
306+ match ch. numeric_value ( ) {
307+ Some ( Number :: Integer ( n) ) => {
308+ return Ok ( vm. ctx . new_float ( n as f64 ) . into ( ) ) ;
309+ }
310+ Some ( Number :: Rational ( num, den) ) => {
311+ return Ok ( vm. ctx . new_float ( num as f64 / den as f64 ) . into ( ) ) ;
312+ }
313+ None => { }
314+ }
315+ }
316+ default. ok_or_else ( || vm. new_value_error ( "not a numeric character" ) )
317+ }
318+
207319 #[ pygetset]
208320 fn unidata_version ( & self ) -> String {
209321 self . unic_version . to_string ( )
210322 }
211323 }
212324
325+ fn decomposition_type_tag ( dt : DecompositionType ) -> & ' static str {
326+ match dt {
327+ DecompositionType :: Canonical => "canonical" ,
328+ DecompositionType :: Compat => "compat" ,
329+ DecompositionType :: Circle => "circle" ,
330+ DecompositionType :: Final => "final" ,
331+ DecompositionType :: Font => "font" ,
332+ DecompositionType :: Fraction => "fraction" ,
333+ DecompositionType :: Initial => "initial" ,
334+ DecompositionType :: Isolated => "isolated" ,
335+ DecompositionType :: Medial => "medial" ,
336+ DecompositionType :: Narrow => "narrow" ,
337+ DecompositionType :: Nobreak => "noBreak" ,
338+ DecompositionType :: Small => "small" ,
339+ DecompositionType :: Square => "square" ,
340+ DecompositionType :: Sub => "sub" ,
341+ DecompositionType :: Super => "super" ,
342+ DecompositionType :: Vertical => "vertical" ,
343+ DecompositionType :: Wide => "wide" ,
344+ }
345+ }
346+
213347 trait EastAsianWidthAbbrName {
214348 fn abbr_name ( & self ) -> & ' static str ;
215349 }
0 commit comments