Skip to content

Commit 3349360

Browse files
committed
Implement missing unicodedata functions and fix lookup error type
Add combining, decomposition, digit, decimal, numeric methods to Ucd. Change lookup() to raise KeyError instead of LookupError. Remove expectedFailure markers from 9 passing tests. Add unicodedata.is_normalized() method. Rename decomp_chars to chars to fix spell check. Remove expectedFailure from test_named_unicode_escapes and test_urlsplit_normalization.
1 parent 9ec6d6c commit 3349360

File tree

5 files changed

+137
-14
lines changed

5 files changed

+137
-14
lines changed

Lib/test/test_re.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -851,7 +851,6 @@ def test_other_escapes(self):
851851
with self.subTest(c):
852852
self.assertRaises(re.PatternError, re.compile, '[\\%c]' % c)
853853

854-
@unittest.expectedFailure # TODO: RUSTPYTHON
855854
def test_named_unicode_escapes(self):
856855
# test individual Unicode named escapes
857856
self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))

Lib/test/test_ucn.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,6 @@ def check_version(testfile):
203203
with self.assertRaises(KeyError):
204204
unicodedata.ucd_3_2_0.lookup(seqname)
205205

206-
@unittest.expectedFailure # TODO: RUSTPYTHON
207206
def test_errors(self):
208207
self.assertRaises(TypeError, unicodedata.name)
209208
self.assertRaises(TypeError, unicodedata.name, 'xx')

Lib/test/test_unicodedata.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ def test_no_names_in_pua(self):
120120
char = chr(i)
121121
self.assertRaises(ValueError, self.db.name, char)
122122

123-
@unittest.expectedFailure # TODO: RUSTPYTHON; LookupError: undefined character name 'LATIN SMLL LETR A'
124123
def test_lookup_nonexistant(self):
125124
# just make sure that lookup can fail
126125
for nonexistent in [
@@ -133,7 +132,6 @@ def test_lookup_nonexistant(self):
133132
]:
134133
self.assertRaises(KeyError, self.db.lookup, nonexistent)
135134

136-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit'
137135
def test_digit(self):
138136
self.assertEqual(self.db.digit('A', None), None)
139137
self.assertEqual(self.db.digit('9'), 9)
@@ -146,7 +144,6 @@ def test_digit(self):
146144
self.assertRaises(TypeError, self.db.digit, 'xx')
147145
self.assertRaises(ValueError, self.db.digit, 'x')
148146

149-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'numeric'
150147
def test_numeric(self):
151148
self.assertEqual(self.db.numeric('A',None), None)
152149
self.assertEqual(self.db.numeric('9'), 9)
@@ -160,7 +157,6 @@ def test_numeric(self):
160157
self.assertRaises(TypeError, self.db.numeric, 'xx')
161158
self.assertRaises(ValueError, self.db.numeric, 'x')
162159

163-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decimal'
164160
def test_decimal(self):
165161
self.assertEqual(self.db.decimal('A',None), None)
166162
self.assertEqual(self.db.decimal('9'), 9)
@@ -193,7 +189,6 @@ def test_bidirectional(self):
193189
self.assertRaises(TypeError, self.db.bidirectional)
194190
self.assertRaises(TypeError, self.db.bidirectional, 'xx')
195191

196-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decomposition'
197192
def test_decomposition(self):
198193
self.assertEqual(self.db.decomposition('\uFFFE'),'')
199194
self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
@@ -210,7 +205,6 @@ def test_mirrored(self):
210205
self.assertRaises(TypeError, self.db.mirrored)
211206
self.assertRaises(TypeError, self.db.mirrored, 'xx')
212207

213-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'combining'
214208
def test_combining(self):
215209
self.assertEqual(self.db.combining('\uFFFE'), 0)
216210
self.assertEqual(self.db.combining('a'), 0)
@@ -313,7 +307,6 @@ def test_failed_import_during_compiling(self):
313307
"(can't load unicodedata module)"
314308
self.assertIn(error, result.err.decode("ascii"))
315309

316-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'decimal'
317310
def test_decimal_numeric_consistent(self):
318311
# Test that decimal and numeric are consistent,
319312
# i.e. if a character has a decimal value,
@@ -327,7 +320,6 @@ def test_decimal_numeric_consistent(self):
327320
count += 1
328321
self.assertTrue(count >= 10) # should have tested at least the ASCII digits
329322

330-
@unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit'
331323
def test_digit_numeric_consistent(self):
332324
# Test that digit and numeric are consistent,
333325
# i.e. if a character has a digit value,

Lib/test/test_urlparse.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1495,7 +1495,6 @@ def test_all(self):
14951495
expected.append(name)
14961496
self.assertCountEqual(urllib.parse.__all__, expected)
14971497

1498-
@unittest.expectedFailure # TODO: RUSTPYTHON
14991498
def test_urlsplit_normalization(self):
15001499
# Certain characters should never occur in the netloc,
15011500
# including under normalization.

crates/stdlib/src/unicodedata.rs

Lines changed: 137 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ mod unicodedata {
4343
};
4444
use itertools::Itertools;
4545
use rustpython_common::wtf8::{CodePoint, Wtf8Buf};
46-
use ucd::{Codepoint, EastAsianWidth};
46+
use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType};
4747
use unic_char_property::EnumeratedCharProperty;
4848
use unic_normal::StrNormalForm;
4949
use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion};
@@ -62,9 +62,15 @@ mod unicodedata {
6262
"lookup",
6363
"name",
6464
"bidirectional",
65+
"combining",
66+
"decimal",
67+
"decomposition",
68+
"digit",
6569
"east_asian_width",
66-
"normalize",
70+
"is_normalized",
6771
"mirrored",
72+
"normalize",
73+
"numeric",
6874
] {
6975
module.set_attr(attr, ucd.get_attr(attr, vm)?, vm)?;
7076
}
@@ -125,7 +131,11 @@ mod unicodedata {
125131
{
126132
return Ok(character.to_string());
127133
}
128-
Err(vm.new_lookup_error(format!("undefined character name '{name}'")))
134+
Err(vm.new_key_error(
135+
vm.ctx
136+
.new_str(format!("undefined character name '{name}'"))
137+
.into(),
138+
))
129139
}
130140

131141
#[pymethod]
@@ -189,6 +199,19 @@ mod unicodedata {
189199
Ok(normalized_text)
190200
}
191201

202+
#[pymethod]
203+
fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult<bool> {
204+
use super::NormalizeForm::*;
205+
let text = unistr.as_wtf8();
206+
let normalized: Wtf8Buf = match form {
207+
Nfc => text.map_utf8(|s| s.nfc()).collect(),
208+
Nfkc => text.map_utf8(|s| s.nfkc()).collect(),
209+
Nfd => text.map_utf8(|s| s.nfd()).collect(),
210+
Nfkd => text.map_utf8(|s| s.nfkd()).collect(),
211+
};
212+
Ok(text == &*normalized)
213+
}
214+
192215
#[pymethod]
193216
fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
194217
match self.extract_char(character, vm)? {
@@ -204,12 +227,123 @@ mod unicodedata {
204227
}
205228
}
206229

230+
#[pymethod]
231+
fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<i32> {
232+
Ok(self
233+
.extract_char(character, vm)?
234+
.and_then(|c| c.to_char())
235+
.map_or(0, |ch| ch.canonical_combining_class() as i32))
236+
}
237+
238+
#[pymethod]
239+
fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult<String> {
240+
let ch = match self.extract_char(character, vm)?.and_then(|c| c.to_char()) {
241+
Some(ch) => ch,
242+
None => return Ok(String::new()),
243+
};
244+
let chars: Vec<char> = ch.decomposition_map().collect();
245+
// If decomposition maps to just the character itself, there's no decomposition
246+
if chars.len() == 1 && chars[0] == ch {
247+
return Ok(String::new());
248+
}
249+
let hex_parts = chars
250+
.iter()
251+
.map(|c| format!("{:04X}", *c as u32))
252+
.join(" ");
253+
let tag = match ch.decomposition_type() {
254+
Some(DecompositionType::Canonical) | None => return Ok(hex_parts),
255+
Some(dt) => decomposition_type_tag(dt),
256+
};
257+
Ok(format!("<{tag}> {hex_parts}"))
258+
}
259+
260+
#[pymethod]
261+
fn digit(
262+
&self,
263+
character: PyStrRef,
264+
default: OptionalArg<PyObjectRef>,
265+
vm: &VirtualMachine,
266+
) -> PyResult {
267+
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
268+
if let Some(ch) = ch
269+
&& matches!(
270+
ch.numeric_type(),
271+
Some(NumericType::Decimal) | Some(NumericType::Digit)
272+
)
273+
&& let Some(Number::Integer(n)) = ch.numeric_value()
274+
{
275+
return Ok(vm.ctx.new_int(n).into());
276+
}
277+
default.ok_or_else(|| vm.new_value_error("not a digit"))
278+
}
279+
280+
#[pymethod]
281+
fn decimal(
282+
&self,
283+
character: PyStrRef,
284+
default: OptionalArg<PyObjectRef>,
285+
vm: &VirtualMachine,
286+
) -> PyResult {
287+
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
288+
if let Some(ch) = ch
289+
&& ch.numeric_type() == Some(NumericType::Decimal)
290+
&& let Some(Number::Integer(n)) = ch.numeric_value()
291+
{
292+
return Ok(vm.ctx.new_int(n).into());
293+
}
294+
default.ok_or_else(|| vm.new_value_error("not a decimal"))
295+
}
296+
297+
#[pymethod]
298+
fn numeric(
299+
&self,
300+
character: PyStrRef,
301+
default: OptionalArg<PyObjectRef>,
302+
vm: &VirtualMachine,
303+
) -> PyResult {
304+
let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char());
305+
if let Some(ch) = ch {
306+
match ch.numeric_value() {
307+
Some(Number::Integer(n)) => {
308+
return Ok(vm.ctx.new_float(n as f64).into());
309+
}
310+
Some(Number::Rational(num, den)) => {
311+
return Ok(vm.ctx.new_float(num as f64 / den as f64).into());
312+
}
313+
None => {}
314+
}
315+
}
316+
default.ok_or_else(|| vm.new_value_error("not a numeric character"))
317+
}
318+
207319
// Getter exposing this instance's `unic_version` field rendered as a string.
#[pygetset]
208320
fn unidata_version(&self) -> String {
209321
self.unic_version.to_string()
210322
}
211323
}
212324

325+
fn decomposition_type_tag(dt: DecompositionType) -> &'static str {
326+
match dt {
327+
DecompositionType::Canonical => "canonical",
328+
DecompositionType::Compat => "compat",
329+
DecompositionType::Circle => "circle",
330+
DecompositionType::Final => "final",
331+
DecompositionType::Font => "font",
332+
DecompositionType::Fraction => "fraction",
333+
DecompositionType::Initial => "initial",
334+
DecompositionType::Isolated => "isolated",
335+
DecompositionType::Medial => "medial",
336+
DecompositionType::Narrow => "narrow",
337+
DecompositionType::Nobreak => "noBreak",
338+
DecompositionType::Small => "small",
339+
DecompositionType::Square => "square",
340+
DecompositionType::Sub => "sub",
341+
DecompositionType::Super => "super",
342+
DecompositionType::Vertical => "vertical",
343+
DecompositionType::Wide => "wide",
344+
}
345+
}
346+
213347
/// Extension trait declaring `abbr_name`, which yields a `&'static str` for
/// the implementing value.
///
/// NOTE(review): presumably the short East Asian Width code for the value;
/// the implementation is outside this view — confirm against the `impl`.
trait EastAsianWidthAbbrName {
214348
fn abbr_name(&self) -> &'static str;
215349
}

0 commit comments

Comments
 (0)