From 4c751f1a7cf3901f2e5470443766c93937266bed Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Fri, 7 Apr 2023 23:00:35 -0400 Subject: [PATCH 1/2] Parse quoted-string local parts but by default keep them disallowed with better exception messages People have opened issues several times about quoted local parts being incorrectly rejected. We can give a better error when it happens to head-off questions about it by parsing them so that we know when they occur. * Detect when a quoted-string local part might be present when splitting the address into a local part and domain part when the address has quoted @-signs in the local part rather than giving an error message about multiple @-signs. * Remove the surrounding quotes and un-escape the string before checking the syntax of the local part. Return the un-quoted and un-escaped string as the normalized local_part in the returned ValidatedEmail object if it's valid as an unquoted local part. * Check for invalid characters in the quoted-string (per the spec and our additional Unicode character checks) and raise exceptions. * Add a new option to accept quoted-string local parts which is off by default. When accepting them, apply Unicode normalization as per dot-atom internationalized addresses and apply minimal backslash escaping. * Update tests. --- CHANGELOG.md | 1 + README.md | 29 +++++--- email_validator/__init__.py | 3 +- email_validator/rfc_constants.py | 9 +++ email_validator/syntax.py | 113 ++++++++++++++++++++++++------ email_validator/validate_email.py | 45 +++++++++--- tests/test_syntax.py | 45 ++++++++---- 7 files changed, 189 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bcd013e..ff57248 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ There are no significant changes to which email addresses are considered valid/i * The dnspython package is no longer required if DNS checks are not used, although it will install automatically. 
* NoNameservers and NXDOMAIN DNS errors are now handled differently: NoNameservers no longer fails validation, and NXDOMAIN now skips checking for an A/AAAA fallback and goes straight to failing validation. * Some syntax error messages have changed because they are now checked explicitly rather than as a part of other checks. +* The quoted-string local part syntax (e.g. multiple @-signs, spaces, etc. if surrounded by quotes) is now parsed but not considered valid by default. Better error messages are now given for quoted-string syntax since it can be confusing for a technically valid address to be rejected, and a new allow_quoted_local option is added to allow these addresses if you really need them. * Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. diff --git a/README.md b/README.md index e40db48..2871187 100644 --- a/README.md +++ b/README.md @@ -22,14 +22,14 @@ Key features: * Supports internationalized domain names and internationalized local parts. Blocks unsafe characters for your safety. * Normalizes email addresses (important for internationalized - addresses! see below). + and quoted-string addresses! see below). * Python type annotations are used. -This library does NOT permit obsolete forms of email addresses, so -if you need strict validation against the email specs exactly, use +This library does NOT permit obsolete forms of email addresses by default, +so if you need strict validation against the email specs exactly, use [pyIsEmail](https://github.com/michaelherold/pyIsEmail) or try [flanker](https://github.com/mailgun/flanker) if you are parsing the -To: line of an email. +"To:" line of an email. 
[![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -103,8 +103,8 @@ But when an email address is valid, an object is returned containing a normalized form of the email address (which you should use!) and other information. -The validator doesn't permit obsoleted forms of email addresses that no -one uses anymore even though they are still valid and deliverable, since +The validator doesn't, by default, permit obsoleted forms of email addresses +that no one uses anymore even though they are still valid and deliverable, since they will probably give you grief if you're using email for login. (See later in the document about that.) @@ -134,6 +134,8 @@ The `validate_email` function also accepts the following keyword arguments require the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) extension. You can also set `email_validator.ALLOW_SMTPUTF8` to `False` to turn it off for all calls by default. +`allow_quoted_local=False`: Set to `True` to allow obscure and potentially problematic email addresses in which the part of the address before the @-sign contains spaces, @-signs, or other surprising characters when the local part is surrounded in quotes (so-called quoted-string local parts). In the object returned by `validate_email`, the normalized local part removes any unnecessary backslash-escaping and even removes the surrounding quotes if the address would be valid without them. You can also set `email_validator.ALLOW_QUOTED_LOCAL` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -288,6 +290,11 @@ and conversion from Punycode to Unicode characters. 3.1](https://tools.ietf.org/html/rfc6532#section-3.1) and [RFC 5895 (IDNA 2008) section 2](http://www.ietf.org/rfc/rfc5895.txt).) 
+Normalization is also applied to quoted-string local parts if you have +allowed them by the `allow_quoted_local` option. Unnecessary backslash +escaping is removed and even the surrounding quotes are removed if they +are unnecessary. + Examples -------- @@ -355,9 +362,9 @@ are: | Field | Value | | -----:|-------| -| `email` | The normalized form of the email address that you should put in your database. This merely combines the `local_part` and `domain` fields (see below). | +| `email` | The normalized form of the email address that you should put in your database. This combines the `local_part` and `domain` fields (see below). | | `ascii_email` | If set, an ASCII-only form of the email address by replacing the domain part with [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt). This field will be present when an ASCII-only form of the email address exists (including if the email address is already ASCII). If the local part of the email address contains internationalized characters, `ascii_email` will be `None`. If set, it merely combines `ascii_local_part` and `ascii_domain`. | -| `local_part` | The local part of the given email address (before the @-sign) with Unicode NFC normalization applied. | +| `local_part` | The normalized local part of the given email address (before the @-sign). Normalization includes Unicode NFC normalization and removing unnecessary quoted-string quotes and backslashes. If `allow_quoted_local` is True and the surrounding quotes are necessary, the quotes _will_ be present in this field. | | `ascii_local_part` | If set, the local part, which is composed of ASCII characters only. | | `domain` | The canonical internationalized Unicode form of the domain part of the email address. 
If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | @@ -383,9 +390,9 @@ or likely to cause trouble: (except see the `test_environment` parameter above). * Obsolete email syntaxes are rejected: The "quoted string" form of the local part of the email address (RFC - 5321 4.1.2) is not permitted. - Quoted forms allow multiple @-signs, space characters, and other - troublesome conditions. The unusual [(comment) syntax](https://github.com/JoshData/python-email-validator/issues/77) + 5321 4.1.2) is not permitted unless `allow_quoted_local=True` is given + (see above). + The unusual ["(comment)" syntax](https://github.com/JoshData/python-email-validator/issues/77) is also rejected. The "literal" form for the domain part of an email address (an IP address in brackets) is rejected. Other obsolete and deprecated syntaxes are rejected. No one uses these forms anymore. diff --git a/email_validator/__init__.py b/email_validator/__init__.py index 9d5373e..aa0dc7c 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -25,9 +25,10 @@ def caching_resolver(*args, **kwargs): # Default values for keyword arguments. 
ALLOW_SMTPUTF8 = True +ALLOW_QUOTED_LOCAL = False +GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False -GLOBALLY_DELIVERABLE = True DEFAULT_TIMEOUT = 15 # secs # IANA Special Use Domain Names diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 9584970..cfbde12 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -27,6 +27,15 @@ DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z') DOMAIN_NAME_REGEX = re.compile(r"[A-Za-z]\Z") # all TLDs currently end with a letter +# Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 section 3.3) +# The permitted characters in a quoted string are the characters in the range +# 32-126, except that quotes and (literal) backslashes can only appear when escaped +# by a backslash. When internationalized, UTF8 strings are also permitted except +# the ASCII characters that are not previously permitted (see above). +# QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") +QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") +QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") + # Length constants # RFC 3696 + errata 1003 + errata 1690 (https://www.rfc-editor.org/errata_search.php?rfc=3696&eid=1690) # explains the maximum length of an email address is 254 octets. 
diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 1bd7f3c..8227620 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,10 +1,12 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX import re import unicodedata import idna # implements IDNA 2008; Python's codec is only IDNA 2003 +from typing import Optional def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): @@ -32,7 +34,8 @@ def safe_character_display(c): return unicodedata.name(c, h) -def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False): +def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, + quoted_local_part: bool = False): """Validates the syntax of the local part of an email address.""" if len(local) == 0: @@ -61,24 +64,32 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Check the local part against the non-internationalized regular expression. # Most email addresses match this regex so it's probably fastest to check this first. # (RFC 2822 3.2.4) + # All local parts matching the dot-atom rule are also valid as a quoted string + # so if it was originally quoted (quoted_local_part is True) and this regex matches, + # it's ok. + # (RFC 5321 4.1.2). m = DOT_ATOM_TEXT.match(local) if m: - # It's valid. + # It's valid. And since it's just the permitted ASCII characters, + # it's normalized and safe. 
If the local part was originally quoted, + # the quoting was unnecessary and it'll be returned as normalized to + # non-quoted form. - # Return the local part unchanged and flag that SMTPUTF8 is not needed. + # Return the local part and flag that SMTPUTF8 is not needed. return { "local_part": local, "ascii_local_part": local, "smtputf8": False, } - # The local part failed the ASCII check. Try the extended character set + # The local part failed the basic dot-atom check. Try the extended character set # for internationalized addresses. It's the same pattern but with additional # characters permitted. + # RFC 6531 section 3.3. + valid: Optional[str] = None + requires_smtputf8 = False m = DOT_ATOM_TEXT_INTL.match(local) if m: - # It's valid. - # But international characters in the local part may not be permitted. if not allow_smtputf8: # Check for invalid characters against the non-internationalized @@ -95,15 +106,56 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp # Although the check above should always find something, fall back to this just in case. raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") - # RFC 6532 section 3.1 also says that Unicode NFC normalization should be applied, + # It's valid. + valid = "dot-atom" + requires_smtputf8 = True + + # There are no syntactic restrictions on quoted local parts, so if + # it was originally quoted, it is probably valid. More characters + # are allowed, like @-signs, spaces, and quotes, and there are no + # restrictions on the placement of dots, as in dot-atom local parts. + elif quoted_local_part: + # Check for invalid characters in a quoted string local part. + # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* + # characters which are *not* allowed here. RFC 6531 section 3.3 + # extends the range to UTF8 strings.) 
+ bad_chars = set( + safe_character_display(c) + for c in local + if not QTEXT_INTL.match(c) + ) + if bad_chars: + raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") + + # See if any characters are outside of the ASCII range. + bad_chars = set( + safe_character_display(c) + for c in local + if not (32 <= ord(c) <= 126) + ) + if bad_chars: + requires_smtputf8 = True + + # International characters in the local part may not be permitted. + if not allow_smtputf8: + raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") + + # It's valid. + valid = "quoted" + + # If the local part matches the internationalized dot-atom form or was quoted, + # perform normalization and additional checks for Unicode strings. + if valid: + # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied, # so we'll return the normalized local part in the return value. local = unicodedata.normalize("NFC", local) # Check that the local part is a valid, safe, and sensible Unicode string. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked - # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but - # they may not be valid, safe, or sensible Unicode strings. - check_unsafe_chars(local) + # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the + # email specs, but they may not be valid, safe, or sensible Unicode strings. + # See the function for rationale. + check_unsafe_chars(local, allow_space=(valid == "quoted")) # Try encoding to UTF-8. Failure is possible with some characters like # surrogate code points, but those are checked above. 
Still, we don't @@ -113,15 +165,22 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp except ValueError: raise EmailSyntaxError("The email address contains an invalid character.") - # Flag that SMTPUTF8 will be required for deliverability. + # If this address passes only by the quoted string form, re-quote it + # and backslash-escape quotes and backslashes (removing any unnecessary + # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent, + # and the sending system SHOULD transmit the form that uses the minimum quoting possible." + if valid == "quoted": + local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"' + return { "local_part": local, - "ascii_local_part": None, # no ASCII form is possible - "smtputf8": True, + "ascii_local_part": local if not requires_smtputf8 else None, + "smtputf8": requires_smtputf8, } - # It's not a valid local part either non-internationalized or internationalized. - # Let's find out why. + # It's not a valid local part. Let's find out why. + # (Since quoted local parts are all valid or handled above, these checks + # don't apply in those cases.) # Check for invalid characters. # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3) @@ -142,7 +201,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") -def check_unsafe_chars(s): +def check_unsafe_chars(s, allow_space=False): # Check for unsafe characters or characters that would make the string # invalid or non-sensible Unicode. bad_chars = set() @@ -158,11 +217,23 @@ def check_unsafe_chars(s): # sensible. 
if i == 0: bad_chars.add(c) + elif category == "Zs": + # Spaces outside of the ASCII range are not specifically disallowed in + # internationalized addresses as far as I can tell, but they violate + # the spirit of the non-internationalized specification that email + # addresses do not contain ASCII spaces when not quoted. Excluding + # ASCII spaces when not quoted is handled directly by the atom regex. + # + # In quoted-string local parts, spaces are explicitly permitted, and + # the ASCII space has category Zs, so we must allow it here, and we'll + # allow all Unicode spaces to be consistent. + if not allow_space: + bad_chars.add(c) elif category[0] == "Z": - # Spaces and line/paragraph characters (Z) outside of the ASCII range - # are not specifically disallowed as far as I can tell, but they - # violate the spirit of the non-internationalized specification that - # email addresses do not contain spaces or line breaks when not quoted. + # Line/paragraph characters (Zl and Zp) outside of the ASCII range + # are not specifically disallowed in internationalized addresses + # as far as I can tell, but they violate the spirit of the non-internationalized + # specification that email addresses do not contain line breaks when not quoted. 
bad_chars.add(c) elif category[0] == "C": # Control, format, surrogate, private use, and unassigned code points (C) diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index 8e05498..8e9343c 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -2,7 +2,7 @@ from .exceptions_types import EmailSyntaxError, ValidatedEmail from .syntax import validate_email_local_part, validate_email_domain_part, get_length_reason -from .rfc_constants import EMAIL_MAX_LENGTH +from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR def validate_email( @@ -11,6 +11,7 @@ def validate_email( *, allow_smtputf8: Optional[bool] = None, allow_empty_local: bool = False, + allow_quoted_local: bool = False, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -24,9 +25,12 @@ def validate_email( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, GLOBALLY_DELIVERABLE, DEFAULT_TIMEOUT + from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, GLOBALLY_DELIVERABLE, \ + CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 + if allow_quoted_local is None: + allow_quoted_local = ALLOW_QUOTED_LOCAL if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -45,25 +49,48 @@ def validate_email( except ValueError: raise EmailSyntaxError("The email address is not valid ASCII.") - # At-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") + # Typical email addresses have a single @-sign, but the + # awkward "quoted string" local part form (RFC 5321 4.1.2) + # allows @-signs (and escaped quotes) to appear in the local + # part if the local part is quoted. 
If the address is quoted, + # split it at a non-escaped @-sign and unescape the escaping. + quoted_local_part = False + m = QUOTED_LOCAL_PART_ADDR.match(email) + if m: + quoted_local_part = True + local_part, domain_part = m.groups() + + # Remove backslashes. + import re + local_part = re.sub(r"\\(.)", "\\1", local_part) + + else: + # Split at the one and only at-sign. + parts = email.split('@') + if len(parts) != 2: + raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") + local_part, domain_part = parts # Collect return values in this instance. ret = ValidatedEmail() ret.original_email = email # Validate the email address's local part syntax and get a normalized form. - local_part_info = validate_email_local_part(parts[0], + # If the original address was quoted and the decoded local part is a valid + # unquoted local part, then we'll get back a normalized (unescaped) local + # part. + local_part_info = validate_email_local_part(local_part, allow_smtputf8=allow_smtputf8, - allow_empty_local=allow_empty_local) + allow_empty_local=allow_empty_local, + quoted_local_part=quoted_local_part) + if quoted_local_part and not allow_quoted_local: + raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.") ret.local_part = local_part_info["local_part"] ret.ascii_local_part = local_part_info["ascii_local_part"] ret.smtputf8 = local_part_info["smtputf8"] # Validate the email address's domain part syntax and get a normalized form. 
- domain_part_info = validate_email_domain_part(parts[1], test_environment=test_environment, globally_deliverable=globally_deliverable) + domain_part_info = validate_email_domain_part(domain_part, test_environment=test_environment, globally_deliverable=globally_deliverable) ret.domain = domain_part_info["domain"] ret.ascii_domain = domain_part_info["ascii_domain"] diff --git a/tests/test_syntax.py b/tests/test_syntax.py index acef671..1032b72 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -224,21 +224,29 @@ def test_email_valid_intl_local_part(email_input, output): @pytest.mark.parametrize( - 'email_input,error_msg', + 'email_input,normalized_local_part', [ - ('"unnecessarily.quoted.local.part"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted..local.part"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted.with.at@"@example.com', 'The email address is not valid. It must have exactly one @-sign.'), - ('"quoted with space"@example.com', 'The email address contains invalid characters before the @-sign: \'\"\', SPACE.'), - ('"quoted.with.dquote\\""@example.com', 'The email address contains invalid characters before the @-sign: "\\", \'"\'.'), - ('"unnecessarily.quoted.with.unicode.λ"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted.with..unicode.λ"@example.com', 'The email address contains invalid characters before the @-sign: \'"\'.'), - ('"quoted.with.extraneous.\\escape"@example.com', 'The email address contains invalid characters before the @-sign: "\\", \'"\'.'), + ('"unnecessarily.quoted.local.part"@example.com', 'unnecessarily.quoted.local.part'), + ('"quoted..local.part"@example.com', '"quoted..local.part"'), + ('"quoted.with.at@"@example.com', '"quoted.with.at@"'), + ('"quoted with space"@example.com', '"quoted with space"'), + ('"quoted.with.dquote\\""@example.com', 
'"quoted.with.dquote\\""'), + ('"unnecessarily.quoted.with.unicode.λ"@example.com', 'unnecessarily.quoted.with.unicode.λ'), + ('"quoted.with..unicode.λ"@example.com', '"quoted.with..unicode.λ"'), + ('"quoted.with.extraneous.\\escape"@example.com', 'quoted.with.extraneous.escape'), ]) -def test_email_valid_only_if_quoted_local_part(email_input, error_msg): +def test_email_valid_only_if_quoted_local_part(email_input, normalized_local_part): + # These addresses are invalid with the default allow_quoted_local=False option. with pytest.raises(EmailSyntaxError) as exc_info: validate_email(email_input) - assert str(exc_info.value) == error_msg + assert str(exc_info.value) == 'Quoting the part before the @-sign is not allowed here.' + + # But they are valid if quoting is allowed. + validated = validate_email(email_input, allow_quoted_local=True, check_deliverability=False) + + # Check that the normalized form correctly removed unnecessary backslash escaping + # and even the quoting if they weren't necessary. 
+ assert validated.local_part == normalized_local_part @pytest.mark.parametrize( @@ -356,6 +364,7 @@ def test_email_unsafe_character(s, expected_error): ('email_input', 'expected_error'), [ ('λambdaツ@test', 'Internationalized characters before the @-sign are not supported: \'λ\', \'ツ\'.'), + ('"quoted.with..unicode.λ"@example.com', 'Internationalized characters before the @-sign are not supported: \'λ\'.'), ], ) def test_email_invalid_character_smtputf8_off(email_input, expected_error): @@ -424,7 +433,7 @@ def test_email_test_domain_name_in_test_environment(): ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hij', 'ISEMAIL_RFC5322_TOOLONG'], ['a@abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghikl.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefg.hijk', 'ISEMAIL_RFC5322_DOMAIN_TOOLONG'], ['"test"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], - ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], + # ['""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], # we think an empty quoted string should be invalid ['"""@iana.org', 'ISEMAIL_ERR_EXPECTING_ATEXT'], ['"\\a"@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], ['"\\""@iana.org', 'ISEMAIL_RFC5321_QUOTEDSTRING'], @@ -549,6 +558,13 @@ def test_pyisemail_tests(email_input, status): if status == "ISEMAIL_VALID": # All standard email address forms should not raise an exception. validate_email(email_input, test_environment=True) + + elif status == "ISEMAIL_RFC5321_QUOTEDSTRING": + # Only valid with an option. 
+ with pytest.raises(EmailSyntaxError): + validate_email(email_input, test_environment=True) + validate_email(email_input, allow_quoted_local=True, test_environment=True) + elif "_ERR_" in status or "_TOOLONG" in status \ or "_CFWS_FWS" in status or "_CFWS_COMMENT" in status \ or "_IPV6" in status or status == "ISEMAIL_RFC5322_DOMAIN": @@ -557,13 +573,14 @@ def test_pyisemail_tests(email_input, status): # The ISEMAIL_RFC5322_DOMAIN diagnosis appears to be a syntactically invalid domain. with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) + elif "_DEPREC_" in status \ - or "RFC5321_QUOTEDSTRING" in status \ or "DOMAINLITERAL" in status or "_DOMLIT_" in status or "_ADDRESSLITERAL" in status: - # Quoted strings in the local part, domain literals (IP addresses in brackets), + # Domain literals (IP addresses in brackets) # and other deprecated syntax are valid email addresses and are accepted by pyIsEmail, # but we reject them. with pytest.raises(EmailSyntaxError): validate_email(email_input, test_environment=True) + else: raise ValueError(f"status {status} is not recognized") From 688a2638f1b45784803e3643a53aea6b37472dc7 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Sun, 9 Apr 2023 10:54:47 -0400 Subject: [PATCH 2/2] In the __main__ tool read options to validate_email from environment variables --- CHANGELOG.md | 1 + email_validator/__main__.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff57248..c52ed57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ There are no significant changes to which email addresses are considered valid/i * Some other error messages have changed to not repeat the email address in the error message. * The library has been reorganized internally into smaller modules. * The tests have been reorganized and expanded. Deliverability tests now mostly use captured DNS responses so they can be run off-line. 
+* The __main__ tool now reads options to validate_email from environment variables. * Type annotations have been added to the exported methods and the ValidatedEmail class and some internal methods. Version 1.3.1 (January 21, 2023) diff --git a/email_validator/__main__.py b/email_validator/__main__.py index 9330553..52d1054 100644 --- a/email_validator/__main__.py +++ b/email_validator/__main__.py @@ -10,8 +10,12 @@ # invalid email addresses. When passing an email address on the command # line, if the email address is valid, information about it will be printed. # When using STDIN, no output will be given for valid email addresses. +# +# Keyword arguments to validate_email can be set in environment variables +# of the same name but uppercase (see below). import json +import os import sys from .validate_email import validate_email @@ -22,20 +26,30 @@ def main(dns_resolver=None): # The dns_resolver argument is for tests. + # Set options from environment variables. + options = {} + for varname in ('ALLOW_SMTPUTF8', 'ALLOW_QUOTED_LOCAL', 'GLOBALLY_DELIVERABLE', + 'CHECK_DELIVERABILITY', 'TEST_ENVIRONMENT'): + if varname in os.environ: + options[varname.lower()] = bool(os.environ[varname]) + for varname in ('DEFAULT_TIMEOUT',): + if varname in os.environ: + options[varname.lower()] = float(os.environ[varname]) + if len(sys.argv) == 1: # Validate the email addresses pased line-by-line on STDIN. dns_resolver = dns_resolver or caching_resolver() for line in sys.stdin: email = line.strip() try: - validate_email(email, dns_resolver=dns_resolver) + validate_email(email, dns_resolver=dns_resolver, **options) except EmailNotValidError as e: print(f"{email} {e}") else: # Validate the email address passed on the command line.
email = sys.argv[1] try: - result = validate_email(email, dns_resolver=dns_resolver) + result = validate_email(email, dns_resolver=dns_resolver, **options) print(json.dumps(result.as_dict(), indent=2, sort_keys=True, ensure_ascii=False)) except EmailNotValidError as e: print(e)