Skip to content

Commit 8eb6fb0

Browse files
gh-95555: Allow a negated property as a character set member (GH-152245)
A negated multi-range property such as \P{ASCII} or \P{Pattern_Syntax} was rejected inside a character class. Such members are now alternated in with the other members: [\P{ASCII}abc] becomes [abc] | [^ASCII], and [\P{ASCII}] alone is just the negated charset.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent d91e103 commit 8eb6fb0

2 files changed

Lines changed: 34 additions & 20 deletions

File tree

Lib/re/_parser.py

Lines changed: 21 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -310,7 +310,7 @@ def checkgroupname(self, name, offset):
310310
msg = "bad character in group name %r" % name
311311
raise self.error(msg, len(name) + offset)
312312

313-
def _property_escape(source, escape, in_set=False):
313+
def _property_escape(source, escape):
314314
# handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax")
315315
from . import _properties
316316
if not source.match('{'):
@@ -320,10 +320,6 @@ def _property_escape(source, escape, in_set=False):
320320
if code is None:
321321
raise source.error("unknown property name %r" % name,
322322
len(name) + len(r'\p{}'))
323-
if in_set and code[1][0] == (NEGATE, None):
324-
# A negated multi-range property cannot be a member of a set.
325-
raise source.error("bad escape %s in character class" % escape,
326-
len(name) + len(r'\p{}'))
327323
return code
328324

329325
def _class_escape(source, escape):
@@ -369,7 +365,7 @@ def _class_escape(source, escape):
369365
len(charname) + len(r'\N{}')) from None
370366
return LITERAL, c
371367
elif c in "pP" and source.istext:
372-
return _property_escape(source, escape, in_set=True)
368+
return _property_escape(source, escape)
373369
elif c in OCTDIGITS:
374370
# octal escape (up to three digits)
375371
escape += source.getwhile(2, OCTDIGITS)
@@ -574,11 +570,15 @@ def _difference(left, right, state):
574570
# with the next operand.
575571
_SETOPS = {'||': _union, '&&': _intersect, '--': _difference}
576572

577-
def _operand_elements(set, compound):
578-
# The operand's elements: a standalone nested set, else the member union.
573+
def _operand_elements(set, compound, negated, state):
574+
# The operand's elements: a standalone nested set, else the member union,
575+
# with any negated-property members alternated in (see addmember).
579576
if compound is not None:
580577
return compound
581-
return [_charset_node(_uniq(set))]
578+
result = [_charset_node(_uniq(set))] if set or not negated else None
579+
for neg in negated:
580+
result = [neg] if result is None else _union(result, [neg], state)
581+
return result
582582

583583
def _parse_operand(source, state, nested, here, allow_nested):
584584
# Read one operand, stopping at a set operator or the closing ']'. An
@@ -591,10 +591,15 @@ def _parse_operand(source, state, nested, here, allow_nested):
591591
sourcematch = source.match
592592
set = []
593593
setappend = set.append
594+
negated = [] # \P{...} negated-range props, alternated in at the end
594595
def addmember(code):
595-
# Flatten a \p{...} property's IN into the member set.
596+
# Flatten a \p{...} property's IN into the member set; a negated one is a
597+
# complemented charset, set aside to _union in (it can't join the union).
596598
if code[0] is IN:
597-
set.extend(code[1])
599+
if code[1][0][0] is NEGATE:
600+
negated.append(code)
601+
else:
602+
set.extend(code[1])
598603
else:
599604
setappend(code)
600605
compound = None # elements of a standalone nested-set operand
@@ -607,9 +612,9 @@ def addmember(code):
607612
if this is None:
608613
raise source.error("unterminated character set",
609614
source.tell() - here)
610-
if set or compound is not None:
615+
if set or compound is not None or negated:
611616
if this == "]":
612-
return _operand_elements(set, compound), None
617+
return _operand_elements(set, compound, negated, state), None
613618
if this in '-&|~' and source.next == this:
614619
if this == '~':
615620
import warnings
@@ -621,7 +626,7 @@ def addmember(code):
621626
else:
622627
# '--', '&&' or '||' ends this operand and starts the next.
623628
sourceget() # consume the second operator character
624-
return _operand_elements(set, compound), this + this
629+
return _operand_elements(set, compound, negated, state), this + this
625630
if this[0] == "\\":
626631
code1 = _class_escape(source, this)
627632
else:
@@ -641,12 +646,12 @@ def addmember(code):
641646
# A trailing '-' is a literal.
642647
addmember(code1)
643648
setappend((LITERAL, _ord("-")))
644-
return [_charset_node(_uniq(set))], None
649+
return _operand_elements(set, None, negated, state), None
645650
if that == "-":
646651
# 'X--': difference, not a range. '--' after a single member
647652
# lands here because the range probe consumed the first '-'.
648653
addmember(code1)
649-
return [_charset_node(_uniq(set))], "--"
654+
return _operand_elements(set, None, negated, state), "--"
650655
if that[0] == "\\":
651656
code2 = _class_escape(source, that)
652657
else:

Lib/test/test_re.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,19 @@ def test_property_escapes(self):
10611061
self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '0'))
10621062
self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))
10631063

1064+
# A negated multi-range property (not backed by an engine category) can
1065+
# be a set member; it is alternated in with the other members.
1066+
self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a'))
1067+
self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä'))
1068+
self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日'))
1069+
self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd'))
1070+
self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日'))
1071+
self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~')) # = ASCII
1072+
self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä'))
1073+
# Composes with set operations.
1074+
self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_')) # \w and ASCII
1075+
self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д'))
1076+
10641077
# Errors.
10651078
self.checkPatternError(r'\p', 'missing {, expected property name', 2)
10661079
self.checkPatternError(r'[\p]', 'missing {, expected property name', 3)
@@ -1072,10 +1085,6 @@ def test_property_escapes(self):
10721085
# \p is not special in bytes patterns.
10731086
self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0)
10741087
self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0)
1075-
# A negated multi-range property (one not backed by an engine
1076-
# category) cannot be a set member.
1077-
self.checkPatternError(r'[\P{ASCII}]',
1078-
r'bad escape \P in character class', 1)
10791088

10801089
def test_word_boundaries(self):
10811090
# See http://bugs.python.org/issue10713

0 commit comments

Comments
 (0)