# Source code for whenever._parse

import sys
from datetime import (
    date as _date,
    datetime as _datetime,
    time as _time,
    timedelta as _timedelta,
    timezone as _timezone,
)
from typing import Literal, NoReturn, cast

from ._common import (
    DUMMY_LEAP_YEAR,
    UTC,
    Nanos,
    check_utc_bounds,
    mk_fixed_tzinfo,
)
from ._tz import SafeTzId, TimeZone, get_tz, resolve_ambiguity, validate_tzid



# [docs]
class InvalidOffsetError(ValueError):
    """A string has an invalid offset for the given zone

    Raised by ``zdt_from_iso`` when the UTC offset written in the string
    differs from the offset the named timezone reports for that instant.
    Being a ``ValueError`` subclass, it is also caught by code that handles
    generic parse failures.
    """



def _parse_err(s: str) -> NoReturn:
    raise ValueError(f"Invalid format: {s!r}") from None


def _parse_nanos(s: str) -> Nanos:
    if len(s) > 9 or not s.isdigit() or not s.isascii():
        raise ValueError("Invalid decimals")
    return int(s.ljust(9, "0"))


def _split_nextchar(
    s: str, chars: str, start: int = 0, end: int = -1
) -> tuple[str, str | None, str]:
    for c in chars:
        if (idx := s.find(c, start, end)) != -1:
            return (s[:idx], c, s[idx + 1 :])
    return (s, None, "")


# Membership test for the characters accepted between the date and the time
# part of a string: a space, "T", or "t".
_is_sep = " Tt".__contains__


def _offset_from_iso(s: str) -> int:
    minutes = 0
    seconds = 0
    if len(s) == 5 and s[2] == ":" and s[3] < "6":  # most common: HH:MM
        hours = int(s[:2])
        minutes = int(s[3:])
    elif len(s) == 4 and s[2] < "6":  # HHMM
        hours = int(s[:2])
        minutes = int(s[2:])
    elif len(s) == 2:  # HH
        hours = int(s)
    elif (
        len(s) == 8
        and s[2] == ":"
        and s[5] == ":"
        and s[3] < "6"
        and s[6] < "6"
    ):  # HH:MM:SS
        hours = int(s[:2])
        minutes = int(s[3:5])
        seconds = int(s[6:])
    elif len(s) == 6 and s[2] < "6" and s[4] < "6":  # HHMMSS
        hours = int(s[:2])
        minutes = int(s[2:4])
        seconds = int(s[4:])
    else:
        raise ValueError("Invalid offset format")
    return hours * 3600 + minutes * 60 + seconds


def datetime_from_iso(s: str) -> tuple[_datetime, Nanos]:
    """Parse an ISO 8601 date and time without offset.

    The date may be in extended (``YYYY-MM-DD``) or basic (``YYYYMMDD``)
    form and must be followed by a space, ``T`` or ``t`` and then the time.
    Strings containing ``W`` or non-ASCII characters are rejected up front.

    Returns:
        The naive ``datetime`` (without sub-second part) and the
        nanoseconds parsed from the fractional seconds.

    Raises:
        ValueError: if ``s`` cannot be parsed.
    """
    is_candidate = len(s) >= 11 and "W" not in s and s.isascii()
    if not is_candidate:
        _parse_err(s)

    # OPTIMIZE: the happy path can be faster
    try:
        if _is_sep(s[10]):  # date in extended format
            date = _date.fromisoformat(s[:10])
            time_part = s[11:]
        elif _is_sep(s[8]):  # date in basic format
            date = _date_from_iso_basic(s[:8])
            time_part = s[9:]
        else:
            _parse_err(s)
        time, nanos = time_from_iso(time_part)
    except ValueError:
        # Report the whole input rather than the fragment that failed
        _parse_err(s)

    combined = _datetime.combine(date, time)
    return combined, nanos


def offset_dt_from_iso(s: str) -> tuple[_datetime, Nanos]:
    """Parse an ISO 8601 date and time that carries a UTC offset.

    The offset is mandatory and may be ``Z``/``z`` or a numeric offset.
    A bracketed timezone ID, if present, is parsed but not used.

    Returns:
        The aware ``datetime`` (after ``check_utc_bounds``) and the
        nanoseconds parsed from the fractional seconds.

    Raises:
        ValueError: if ``s`` cannot be parsed or has no offset.
    """
    if len(s) < 11 or "W" in s[:11] or not s.isascii():
        _parse_err(s)

    try:
        if _is_sep(s[10]):  # date in extended format
            date = _date.fromisoformat(s[:10])
            remainder = s[11:]
        elif _is_sep(s[8]):  # date in basic format
            date = _date_from_iso_basic(s[:8])
            remainder = s[9:]
        else:
            _parse_err(s)

        time, nanos, offset, _ = _time_offset_tz_from_iso(remainder)
        if offset is None:
            raise ValueError("Missing offset")

        if offset == "Z":
            tzinfo = UTC
        else:
            assert isinstance(offset, _timezone)
            tzinfo = offset

        combined = _datetime.combine(date, time, tzinfo)
        return (check_utc_bounds(combined), nanos)
    except ValueError:
        # Any failure above is reported against the complete input string
        _parse_err(s)


def zdt_from_iso(s: str) -> tuple[_datetime, Nanos, TimeZone]:
    """Parse an ISO 8601 date and time with a bracketed timezone ID.

    The timezone ID is mandatory; the UTC offset is optional:

    - no offset: the local time is handed to ``resolve_ambiguity`` with the
      ``"compatible"`` policy;
    - ``Z``: the time is read as UTC and converted to the zone's offset at
      that instant;
    - numeric offset: it must equal the zone's offset at that instant.

    Returns:
        The ``datetime``, the nanoseconds parsed from the fractional
        seconds, and the ``TimeZone`` returned by ``get_tz``.

    Raises:
        ValueError: if ``s`` cannot be parsed or has no timezone ID.
        InvalidOffsetError: if the numeric offset does not match the zone.
    """
    if len(s) < 11 or "W" in s[:11] or not s.isascii():
        _parse_err(s)

    try:
        if _is_sep(s[10]):  # date in extended format
            date = _date.fromisoformat(s[:10])
            remainder = s[11:]
        elif _is_sep(s[8]):  # date in basic format
            date = _date_from_iso_basic(s[:8])
            remainder = s[9:]
        else:
            _parse_err(s)
        time, nanos, offset, tzid = _time_offset_tz_from_iso(remainder)
    except ValueError:
        _parse_err(s)

    if tzid is None:
        _parse_err(s)

    tz = get_tz(tzid)

    if offset is None:
        local_dt = _datetime.combine(date, time)
        return (resolve_ambiguity(local_dt, tz, "compatible"), nanos, tz)

    if offset == "Z":
        utc_dt = _datetime.combine(date, time, UTC)
        zone_offset = tz.offset_for_instant(int(utc_dt.timestamp()))
        return (utc_dt.astimezone(mk_fixed_tzinfo(zone_offset)), nanos, tz)

    assert isinstance(offset, _timezone)
    dt = _datetime.combine(date, time, offset)
    # Raise an exception if instant is out of range
    dt.astimezone(UTC)
    # Ensure the offset is correct for the given instant
    expected_offset = tz.offset_for_instant(int(dt.timestamp()))
    # NOTE: mypy doesn't know utcoffset() can never return None here
    if dt.utcoffset().total_seconds() != expected_offset:  # type: ignore[union-attr]
        raise InvalidOffsetError()
    return (dt, nanos, tz)


def time_from_iso(s_orig: str) -> tuple[_time, Nanos]:
    """Parse an ISO 8601 time of day with optional fractional seconds.

    The fraction separator (``.`` or ``,``) is only looked for at indices
    6 through 8, i.e. right after ``HHMMSS`` or ``HH:MM:SS``.

    Returns:
        The ``time`` without sub-second part, and the fraction as
        nanoseconds (0 when there is no fraction).

    Raises:
        ValueError: if ``s_orig`` cannot be parsed.
    """
    whole, separator, fraction = _split_nextchar(s_orig, ".,", 6, 9)

    try:
        time = _time_from_iso_nofrac(whole)
        nanos = _parse_nanos(fraction) if separator else 0
    except ValueError:
        _parse_err(s_orig)
    return (time, nanos)


# Parse the time, UTC offset, and timezone ID
def _time_offset_tz_from_iso(
    s: str,
) -> tuple[_time, Nanos, _timezone | Literal["Z"] | None, SafeTzId | None]:
    """Split ``s`` into time, optional UTC offset, and optional timezone ID.

    Returns:
        ``(time, nanos, offset, tz)`` where ``offset`` is ``"Z"``, a fixed
        ``tzinfo`` built by ``mk_fixed_tzinfo``, or ``None`` when absent,
        and ``tz`` is the validated bracketed ID or ``None`` when absent.

    Raises:
        ValueError: if any of the parts is malformed. This includes a
            closing bracket with no opening bracket, which fails the
            two-value unpacking below.
    """
    # strip the bracketed timezone (if present)
    tz: SafeTzId | None = None
    if s.endswith("]"):
        # NOTE: sorry for the unicode escape sequences. Literal brackets
        # break my LSP's indentation detection. \x5b is open bracket '['
        s, tz_raw = s[:-1].rsplit("\x5b", 1)
        tz = validate_tzid(tz_raw)

    # determine the offset
    offset: Literal["Z"] | _timezone | None = None
    if s.endswith(("Z", "z")):
        s_time = s[:-1]
        offset = "Z"
    else:
        s_time, sign, s_offset = _split_nextchar(s, "+-")
        if sign is not None:
            offset_secs = _offset_from_iso(s_offset)
            signed_secs = -offset_secs if sign == "-" else offset_secs
            offset = mk_fixed_tzinfo(signed_secs)

    time, nanos = time_from_iso(s_time)
    return (time, nanos, offset, tz)


def yearmonth_from_iso(s: str) -> _date:
    """Parse an ISO 8601 year-month (``YYYY-MM`` or ``YYYYMM``).

    Returns:
        A ``date`` for the first day of that month.

    Raises:
        ValueError: if ``s`` has the wrong shape, contains anything other
            than ASCII digits in the year or month field, or names a
            year/month that ``date`` rejects.
    """
    if len(s) == 7 and s[4] == "-":  # extended format
        year_raw, month_raw = s[:4], s[5:]
    elif len(s) == 6:  # basic format
        year_raw, month_raw = s[:4], s[4:]
    else:
        _parse_err(s)

    # int() also accepts signs, whitespace and underscores, so "2020+1" or
    # "2_02-01" would otherwise be parsed as if they were valid.
    if not (s.isascii() and year_raw.isdigit() and month_raw.isdigit()):
        _parse_err(s)

    try:
        return _date(int(year_raw), int(month_raw), 1)
    except ValueError:
        _parse_err(s)


def monthday_from_iso(s: str) -> _date:
    """Parse an ISO 8601 month-day (``--MM-DD`` or ``--MMDD``).

    Returns:
        A ``date`` with that month and day in the year ``DUMMY_LEAP_YEAR``
        (presumably a leap year so that February 29th is representable;
        the constant is defined in ``._common``).

    Raises:
        ValueError: if ``s`` has the wrong shape, contains anything other
            than ASCII digits in the month or day field, or names a
            month/day that ``date`` rejects.
    """
    if not (s.startswith("--") and s.isascii()):
        _parse_err(s)

    if len(s) == 7 and s[4] == "-":  # extended format
        month_raw, day_raw = s[2:4], s[5:]
    elif len(s) == 6:  # basic format
        month_raw, day_raw = s[2:4], s[4:]
    else:
        _parse_err(s)

    # int() also accepts signs, whitespace and underscores, so "--01+1" or
    # "--1 01" would otherwise be parsed as if they were valid.
    if not (month_raw.isdigit() and day_raw.isdigit()):
        _parse_err(s)

    try:
        return _date(DUMMY_LEAP_YEAR, int(month_raw), int(day_raw))
    except ValueError:
        _parse_err(s)


# The ISO parsing functions were improved in Python 3.11,
# so we use them if available.
if sys.version_info >= (3, 11):

    _date_from_iso_basic = _date.fromisoformat

    def _time_from_iso_nofrac(s: str) -> _time:
        # Compensate for a bug in CPython where times like "12:34:56:78" are
        # accepted as valid times. This is only fixed in Python 3.14+
        if s.count(":") > 2:
            raise ValueError()
        if all(map("0123456789:".__contains__, s)):
            return _time.fromisoformat(s)
        raise ValueError()

    def date_from_iso(s: str) -> _date:
        # prevent isoformat from parsing stuff we don't want it to
        if "W" in s or not s.isascii():
            _parse_err(s)
        try:
            return _date.fromisoformat(s)
        except ValueError:
            _parse_err(s)

else:  # pragma: no cover

    def _date_from_iso_basic(s: str, /) -> _date:
        return _date.fromisoformat(s[:4] + "-" + s[4:6] + "-" + s[6:8])

    def _time_from_iso_nofrac(s: str) -> _time:
        # Compensate for the fact that Python's isoformat
        # doesn't support basic ISO 8601 formats
        if len(s) == 4:
            s = s[:2] + ":" + s[2:]
        elif len(s) == 6:
            s = s[:2] + ":" + s[2:4] + ":" + s[4:]
        if all(map("0123456789:".__contains__, s)):
            return _time.fromisoformat(s)
        raise ValueError()

    def date_from_iso(s: str) -> _date:
        if not s.isascii():
            _parse_err(s)
        try:
            if len(s) == 8:
                return _date_from_iso_basic(s)
            return _date.fromisoformat(s)
        except ValueError:
            _parse_err(s)


# Lowercase three-letter weekday abbreviation -> ISO weekday number
# (Monday is 1, Sunday is 7), as compared against ``date.isoweekday()``.
_RFC2822_WEEKDAY_TO_ISO = {
    "mon": 1,
    "tue": 2,
    "wed": 3,
    "thu": 4,
    "fri": 5,
    "sat": 6,
    "sun": 7,
}


# Lowercase three-letter month abbreviation -> month number (1-12).
# Lookups lowercase the input first, so matching is case-insensitive.
_RFC2822_MONTH_NAMES = {
    "jan": 1,
    "feb": 2,
    "mar": 3,
    "apr": 4,
    "may": 5,
    "jun": 6,
    "jul": 7,
    "aug": 8,
    "sep": 9,
    "oct": 10,
    "nov": 11,
    "dec": 12,
}

# Reverse tables: title-cased abbreviations in the insertion order of the
# dicts above. The weekday list is 0-indexed (index 0 is "Mon"); the month
# list gets an empty placeholder so that month numbers index it directly.
WEEKDAY_TO_RFC2822 = [s.title() for s in _RFC2822_WEEKDAY_TO_ISO]
MONTH_TO_RFC2822 = [s.title() for s in _RFC2822_MONTH_NAMES]
MONTH_TO_RFC2822.insert(0, "")  # 1-indexed

# Uppercase zone abbreviation -> UTC offset in whole hours. Alphabetic
# zones missing from this table are treated as offset 0 by the parser.
_RFC2822_ZONES = {
    "EST": -5,
    "EDT": -4,
    "CST": -6,
    "CDT": -5,
    "MST": -7,
    "MDT": -6,
    "PST": -8,
    "PDT": -7,
    "UT": 0,
    "GMT": 0,
}


def parse_rfc2822(s: str) -> _datetime:
    """Parse an RFC 2822 date-time string into an aware ``datetime``.

    Layout: an optional weekday and comma, then day, month abbreviation,
    year, time (``HH:MM`` or ``HH:MM:SS``) and a zone. Parts are separated
    by ASCII whitespace; weekday and month names are case-insensitive.

    - A weekday, when given, must match the date.
    - Two-digit years below 50 map to 20xx, others to 19xx; three-digit
      years have 1900 added.
    - The zone is either ``+HHMM``/``-HHMM`` or alphabetic. Alphabetic
      zones not in ``_RFC2822_ZONES`` get offset 0.
    - All numeric fields must consist of plain ASCII digits.

    Returns:
        The ``datetime`` with a fixed-offset ``tzinfo``, passed through
        ``check_utc_bounds``.

    Raises:
        ValueError: if ``s`` cannot be parsed.
    """
    # Technically, only tab, space and CRLF are allowed in RFC2822,
    # but we allow any ASCII whitespace
    if not s.isascii():
        _parse_err(s)

    # Parse the weekday
    try:
        first, second, *parts = s.split()
        if first.isdigit():
            iso_weekday = None
            parts = [first, second, *parts]
        else:
            # Case: Mon, 23 Jan
            if len(first) == 4 and first[3] == ",":
                weekday_raw = first[:3]
                parts = [second, *parts]
            # Case: Mon , 23 Jan
            elif len(first) == 3 and second == ",":
                weekday_raw = first
            # Case: Mon ,23 Jan
            elif len(first) == 3 and second.startswith(","):
                weekday_raw = first
                parts = [second[1:], *parts]
            # Case: Mon,23 Jan
            elif len(first) > 4 and first[3] == ",":
                weekday_raw = first[:3]
                parts = [first[4:], second, *parts]
            else:
                _parse_err(s)

            iso_weekday = _RFC2822_WEEKDAY_TO_ISO[weekday_raw.lower()]
    except (ValueError, KeyError):
        _parse_err(s)

    # Parse the date
    try:
        day_raw, month_raw, year_raw, *parts = parts
        # int() also accepts a sign or underscores ("+1", "1_9"), so insist
        # on plain digits before converting.
        if (
            len(day_raw) > 2
            or not day_raw.isdigit()
            or not year_raw.isdigit()
        ):
            _parse_err(s)
        day = int(day_raw)
        month = _RFC2822_MONTH_NAMES[month_raw.lower()]
        if len(year_raw) == 4:
            year = int(year_raw)
        elif len(year_raw) == 2:
            year = int(year_raw)
            if year < 50:
                year += 2000
            else:
                year += 1900
        elif len(year_raw) == 3:
            year = int(year_raw) + 1900
        else:
            _parse_err(s)
        date = _date(year, month, day)
    except (ValueError, KeyError):
        _parse_err(s)

    if iso_weekday and iso_weekday != date.isoweekday():
        _parse_err(s)

    # Parse the time
    try:
        # time components may be separated by whitespace
        *time_parts, offset_raw = parts
        time_raw = "".join(time_parts)
        # Only digits and colons may appear; a misplaced colon then fails
        # in int() below, and a sign such as "+1:30" is rejected here.
        if not time_raw.replace(":", "").isdigit():
            _parse_err(s)
        if len(time_raw) == 5 and time_raw[2] == ":":
            time = _time(int(time_raw[:2]), int(time_raw[3:]))
        elif len(time_raw) == 8 and time_raw[2] == ":" and time_raw[5] == ":":
            time = _time(
                int(time_raw[:2]), int(time_raw[3:5]), int(time_raw[6:])
            )
        else:
            _parse_err(s)
    except ValueError:
        _parse_err(s)

    # Parse the offset
    try:
        if (
            offset_raw.startswith(("+", "-"))
            and len(offset_raw) == 5
            # exactly four digits after the sign; rejects e.g. "+-130"
            and offset_raw[1:].isdigit()
        ):
            sign = 1 if offset_raw[0] == "+" else -1
            offset = (
                _timedelta(
                    hours=int(offset_raw[1:3]), minutes=int(offset_raw[3:5])
                )
                * sign
            )
        elif offset_raw.isalpha():
            # According to the spec, unknown timezones should
            # just be treated as -0000 (UTC with unknown offset)
            offset = _timedelta(
                hours=_RFC2822_ZONES.get(offset_raw.upper(), 0)
            )
        else:
            _parse_err(s)
        tzinfo = _timezone(offset)
    except ValueError:
        _parse_err(s)

    return check_utc_bounds(_datetime.combine(date, time, tzinfo=tzinfo))


# Upper bound on the number of integer digits accepted in a single
# timedelta component; longer values are rejected before int() is called.
_MAX_TDELTA_DIGITS = 35  # consistent with Rust extension


def parse_timedelta_component(
    fullstr: str, exc: Exception
) -> tuple[str, int, Literal["H", "M", "S"]]:
    """Parse the leading ``<number><unit>`` component of a duration string.

    The unit is the first ``H``, ``M`` or ``S`` found in ``fullstr``.
    Everything before it must be ASCII digits; for ``S`` a fraction of up
    to nine digits after ``.`` or ``,`` is also allowed.

    Args:
        fullstr: the unparsed remainder of the duration.
        exc: the exception instance to raise on any parse failure.

    Returns:
        ``(rest, value, unit)``: ``rest`` is what follows the unit letter.
        ``value`` is the integer amount for ``H``/``M``, and the amount in
        nanoseconds for ``S``.

    Raises:
        The given ``exc``, if no unit is found or the number is malformed
        or longer than ``_MAX_TDELTA_DIGITS`` digits.
    """
    try:
        split_index, unit = next(
            (i, c) for i, c in enumerate(fullstr) if c in "HMS"
        )
    except StopIteration:
        raise exc

    raw, rest = fullstr[:split_index], fullstr[split_index + 1 :]

    # str.isdigit() is also true for non-ASCII digits. Some of those (e.g.
    # superscripts) make int() raise a bare ValueError instead of `exc`,
    # and others would be silently accepted, so reject them up front.
    if not raw.isascii():
        raise exc

    if unit == "S":
        digits, sep, nanos_raw = _split_nextchar(raw, ".,")

        if (
            len(digits) > _MAX_TDELTA_DIGITS
            or not digits.isdigit()
            or len(nanos_raw) > 9
            or (sep and not nanos_raw.isdigit())
        ):
            raise exc

        # Without a fraction nanos_raw is "", which pads to nine zeros
        value = int(digits) * 1_000_000_000 + int(nanos_raw.ljust(9, "0"))
    else:
        if len(raw) > _MAX_TDELTA_DIGITS or not raw.isdigit():
            raise exc
        value = int(raw)

    return rest, value, cast(Literal["H", "M", "S"], unit)