# Source code for javaproperties.reading

from   __future__  import unicode_literals
from   collections import namedtuple
import re
from   struct      import pack
from   six         import binary_type, StringIO, BytesIO, text_type, unichr
from   .util       import CONTINUED_RGX, ascii_splitlines

def load(fp, object_pairs_hook=dict):
    """
    Parse the contents of the `~io.IOBase.readline`-supporting file-like object
    ``fp`` as a simple line-oriented ``.properties`` file and return a `dict`
    of the key-value pairs.

    ``fp`` may be either a text or binary filehandle, with or without universal
    newlines enabled.  If it is a binary filehandle, its contents are decoded
    as Latin-1.

    By default, the key-value pairs extracted from ``fp`` are combined into a
    `dict` with later occurrences of a key overriding previous occurrences of
    the same key.  To change this behavior, pass a callable as the
    ``object_pairs_hook`` argument; it will be called with one argument, a
    generator of ``(key, value)`` pairs representing the key-value entries in
    ``fp`` (including duplicates) in order of occurrence.  `load` will then
    return the value returned by ``object_pairs_hook``.

    .. versionchanged:: 0.5.0
        Invalid ``\\uXXXX`` escape sequences will now cause an
        `InvalidUEscapeError` to be raised

    :param fp: the file from which to read the ``.properties`` document
    :type fp: file-like object
    :param callable object_pairs_hook: class or function for combining the
        key-value pairs
    :rtype: `dict` of text strings or the return value of ``object_pairs_hook``
    :raises InvalidUEscapeError: if an invalid ``\\uXXXX`` escape sequence
        occurs in the input
    """
    # `parse()` also yields comment and blank-line elements; only the
    # key-value entries are handed to the hook.
    return object_pairs_hook(
        (kv.key, kv.value) for kv in parse(fp) if isinstance(kv, KeyValue)
    )

def loads(s, object_pairs_hook=dict):
    """
    Parse the contents of the string ``s`` as a simple line-oriented
    ``.properties`` file and return a `dict` of the key-value pairs.

    ``s`` may be either a text string or bytes string.  If it is a bytes
    string, its contents are decoded as Latin-1.

    By default, the key-value pairs extracted from ``s`` are combined into a
    `dict` with later occurrences of a key overriding previous occurrences of
    the same key.  To change this behavior, pass a callable as the
    ``object_pairs_hook`` argument; it will be called with one argument, a
    generator of ``(key, value)`` pairs representing the key-value entries in
    ``s`` (including duplicates) in order of occurrence.  `loads` will then
    return the value returned by ``object_pairs_hook``.

    .. versionchanged:: 0.5.0
        Invalid ``\\uXXXX`` escape sequences will now cause an
        `InvalidUEscapeError` to be raised

    :param string s: the string from which to read the ``.properties`` document
    :param callable object_pairs_hook: class or function for combining the
        key-value pairs
    :rtype: `dict` of text strings or the return value of ``object_pairs_hook``
    :raises InvalidUEscapeError: if an invalid ``\\uXXXX`` escape sequence
        occurs in the input
    """
    # Wrap the string in the in-memory file type that matches its kind so
    # that `load()` sees a binary handle for bytes and a text handle for text.
    fp = BytesIO(s) if isinstance(s, binary_type) else StringIO(s)
    return load(fp, object_pairs_hook=object_pairs_hook)

# Matches an entire comment line whose content looks like a timestamp of the
# form ``Sat Jan 01 12:34:56 UTC 2022`` (used by `Comment.is_timestamp()`).
TIMESTAMP_RGX = re.compile(
    # optional leading whitespace, the comment marker, optional whitespace
    r'\A[ \t\f]*[#!][ \t\f]*'
    # abbreviated day of the week
    r'(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)'
    # abbreviated month name
    r' (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
    # two-digit day of the month (00-31 accepted)
    r' (?:[012][0-9]|3[01])'
    # HH:MM:SS; seconds may go up to 61
    r' (?:[01][0-9]|2[0-3]):[0-5][0-9]:(?:[0-5][0-9]|6[01])'
    # optional three-character timezone; the space before it is mandatory,
    # so a missing zone leaves two consecutive spaces
    r' (?:[A-Za-z_0-9]{3})?'
    # year of four or more digits
    r' [0-9]{4,}'
    # optional trailing whitespace and line terminator, then end of string
    r'[ \t\f]*\r?\n?\Z'
)

class PropertiesElement(object):
    """
    .. versionadded:: 0.7.0

    Superclass of objects returned by `parse()`
    """
    # Empty slots so that the namedtuple-based subclasses, which also declare
    # empty slots, do not gain a per-instance ``__dict__``.
    __slots__ = ()


class Comment(PropertiesElement, namedtuple('Comment', 'source')):
    """
    .. versionadded:: 0.7.0

    Subclass of `PropertiesElement` representing a comment

    .. attribute:: source

        The raw, unmodified input line (including trailing newlines)
    """

    __slots__ = ()

    @property
    def value(self):
        """
        Returns the contents of the comment, with the comment marker, any
        whitespace leading up to it, and the trailing newline removed
        """
        s = self.source.lstrip(' \t\f')
        # Only one marker character is dropped; any further ``#``/``!``
        # characters are part of the comment's contents.
        if s.startswith(('#', '!')):
            s = s[1:]
        return s.rstrip('\r\n')

    @property
    def source_stripped(self):
        """
        Like `source`, but with the final trailing newline (if any) removed
        """
        return self.source.rstrip('\r\n')

    def is_timestamp(self):
        """
        Returns `True` iff the comment's value appears to be a valid timestamp
        as produced by Java 8's ``Date.toString()``
        """
        # The regex is anchored at both ends and includes the comment marker,
        # so it is matched against the raw line rather than against `value`.
        return bool(TIMESTAMP_RGX.match(self.source))


class Whitespace(PropertiesElement, namedtuple('Whitespace', 'source')):
    """
    .. versionadded:: 0.7.0

    Subclass of `PropertiesElement` representing a line that is either empty or
    contains only whitespace (and possibly some line continuations)

    .. attribute:: source

        The raw, unmodified input line (including trailing newlines)
    """

    __slots__ = ()

    @property
    def source_stripped(self):
        """
        Like `source`, but with the final trailing newline and line
        continuation (if any) removed
        """
        s = self.source.rstrip('\r\n')
        # CONTINUED_RGX comes from `.util`; a match is taken to mean that the
        # text ends in a line-continuation backslash, which is dropped here.
        if CONTINUED_RGX.search(s):
            s = s[:-1]
        return s


class KeyValue(PropertiesElement, namedtuple('KeyValue', 'key value source')):
    """
    .. versionadded:: 0.7.0

    Subclass of `PropertiesElement` representing a key-value entry

    .. attribute:: key

        The entry's key, after processing escape sequences

    .. attribute:: value

        The entry's value, after processing escape sequences

    .. attribute:: source

        The concatenation of the raw, unmodified lines in the input (including
        trailing newlines) from which the key and value were extracted
    """

    __slots__ = ()

    @property
    def source_stripped(self):
        """
        Like `source`, but with the final trailing newline and line
        continuation (if any) removed
        """
        s = self.source.rstrip('\r\n')
        # CONTINUED_RGX comes from `.util`; a match is taken to mean that the
        # text ends in a line-continuation backslash, which is dropped here.
        if CONTINUED_RGX.search(s):
            s = s[:-1]
        return s


# A comment line: optional space/tab/form-feed indentation followed by a
# ``#`` or ``!`` marker.
COMMENT_RGX = re.compile(r'^[ \t\f]*[#!]')
# A blank line: nothing but spaces/tabs/form feeds before an optional line
# terminator at the very end of the string.
BLANK_RGX = re.compile(r'^[ \t\f]*\r?\n?\Z')
# The key/value separator.  The lookbehind plus the run of doubled backslashes
# ensure the separator itself is not escaped.  Group 1 is either optional
# whitespace followed by ``=``/``:`` or a single whitespace character; any
# whitespace after it is consumed as part of the overall match.
SEPARATOR_RGX = re.compile(r'(?<!\\)(?:\\\\)*([ \t\f]*[=:]|[ \t\f])[ \t\f]*')

def parse(src):
    """
    Parse the given data as a simple line-oriented ``.properties`` file and
    return a generator of `PropertiesElement` objects representing the
    key-value pairs (as `KeyValue` objects), comments (as `Comment` objects),
    and blank lines (as `Whitespace` objects) in the input in order of
    occurrence.

    If the same key appears multiple times in the input, a separate `KeyValue`
    object is emitted for each entry.

    ``src`` may be a text string, a bytes string, or a text or binary
    filehandle/file-like object supporting the `~io.IOBase.readline` method
    (with or without universal newlines enabled).  Bytes input is decoded as
    Latin-1.


    .. versionchanged:: 0.5.0
        Invalid ``\\uXXXX`` escape sequences will now cause an
        `InvalidUEscapeError` to be raised

    .. versionchanged:: 0.7.0
        `parse()` now accepts strings as input, and it now returns a generator
        of custom objects instead of triples of strings

    :param src: the ``.properties`` document
    :type src: string or file-like object
    :rtype: Iterable[PropertiesElement]
    :raises InvalidUEscapeError: if an invalid ``\\uXXXX`` escape sequence
        occurs in the input
    """
    # Normalise every accepted input kind into one iterator of text lines.
    if isinstance(src, binary_type):
        liter = iter(ascii_splitlines(src.decode('iso-8859-1')))
    elif isinstance(src, text_type):
        liter = iter(ascii_splitlines(src))
    else:
        def lineiter():
            # Pull lines lazily via `readline()` until it returns an empty
            # string, decoding binary results as Latin-1.
            while True:
                ln = src.readline()
                if isinstance(ln, binary_type):
                    ln = ln.decode('iso-8859-1')
                if ln == '':
                    return
                # Each chunk is re-split with `ascii_splitlines` (from
                # `.util`), presumably because one `readline()` result can
                # still hold several line terminators when universal newlines
                # are disabled.
                for piece in ascii_splitlines(ln):
                    yield piece
        liter = lineiter()
    for source in liter:
        line = source
        if COMMENT_RGX.match(line):
            yield Comment(source)
            continue
        elif BLANK_RGX.match(line):
            yield Whitespace(source)
            continue
        line = line.lstrip(' \t\f').rstrip('\r\n')
        # Join continuation lines: drop the final character of the current
        # logical line, then append the next physical line minus its leading
        # whitespace and line terminator.  `source` keeps the raw text.
        while CONTINUED_RGX.search(line):
            line = line[:-1]
            # If the input ends right after a continuation, the missing line
            # is treated as empty.
            nextline = next(liter, '')
            source += nextline
            line += nextline.lstrip(' \t\f').rstrip('\r\n')
        if line == '':  # series of otherwise-blank lines with continuations
            yield Whitespace(source)
            continue
        m = SEPARATOR_RGX.search(line)
        if m:
            # The key ends where group 1 starts, not where the match starts:
            # doubled backslashes matched before the group belong to the key.
            yield KeyValue(
                unescape(line[:m.start(1)]),
                unescape(line[m.end():]),
                source,
            )
        else:
            # No separator: the whole line is the key and the value is empty.
            yield KeyValue(unescape(line), '', source)

# A UTF-16 high surrogate immediately followed by a low surrogate.
SURROGATE_PAIR_RGX = re.compile(r'[\uD800-\uDBFF][\uDC00-\uDFFF]')
# A backslash escape.  Group 1 is what follows the backslash: either ``u``
# plus up to four further characters, or any single character that ``.``
# matches.
ESCAPE_RGX = re.compile(r'\\(u.{0,4}|.)')
# A well-formed ``u`` escape body: ``u`` followed by exactly four hex digits.
U_ESCAPE_RGX = re.compile(r'^u[0-9A-Fa-f]{4}\Z')

def unescape(field):
    """
    Decode escape sequences in a ``.properties`` key or value.  The following
    escape sequences are recognized::

        \\t \\n \\f \\r \\uXXXX \\\\

    If a backslash is followed by any other character, the backslash is
    dropped.

    In addition, any valid UTF-16 surrogate pairs in the string after
    escape-decoding are further decoded into the non-BMP characters they
    represent.  (Invalid & isolated surrogate code points are left as-is.)

    .. versionchanged:: 0.5.0
        Invalid ``\\uXXXX`` escape sequences will now cause an
        `InvalidUEscapeError` to be raised

    :param field: the string to decode
    :type field: text string
    :rtype: text string
    :raises InvalidUEscapeError: if an invalid ``\\uXXXX`` escape sequence
        occurs in the input
    """
    # Two passes: backslash escapes are decoded first, and only then are
    # adjacent high/low surrogates in the result merged into one character.
    return SURROGATE_PAIR_RGX.sub(_unsurrogate, ESCAPE_RGX.sub(_unesc, field))

_unescapes = {'t': '\t', 'n': '\n', 'f': '\f', 'r': '\r'}

def _unesc(m):
    """
    Replacement callback for a backslash-escape match: return the text that
    group 1 of ``m`` (everything after the backslash) decodes to.

    :raises InvalidUEscapeError: if group 1 starts with ``u`` but is not
        ``u`` followed by exactly four hexadecimal digits
    """
    body = m.group(1)
    if body[0] != 'u':
        # Known single-letter escape, or else the character stands for itself
        return _unescapes.get(body, body)
    if not U_ESCAPE_RGX.match(body):
        # Validate explicitly instead of relying on `int` to fail: `int`
        # tolerates trailing whitespace and a leading minus sign.
        raise InvalidUEscapeError('\\' + body)
    hex_digits = body[1:]
    return unichr(int(hex_digits, 16))

def _unsurrogate(m):
    c,d = map(ord, m.group())
    uord = ((c - 0xD800) << 10) + (d - 0xDC00) + 0x10000
    try:
        return unichr(uord)
    except ValueError:
        # Narrow Python build (only a thing pre-3.3)
        return pack('i', uord).decode('utf-32')


class InvalidUEscapeError(ValueError):
    """
    .. versionadded:: 0.5.0

    Raised when an invalid ``\\uXXXX`` escape sequence (i.e., a ``\\u`` not
    immediately followed by four hexadecimal digits) is encountered in a simple
    line-oriented ``.properties`` file
    """

    def __init__(self, escape):
        #: The invalid ``\uXXXX`` escape sequence encountered
        self.escape = escape
        # Pass the escape on so that it is also available via ``args``.
        super(InvalidUEscapeError, self).__init__(escape)

    def __str__(self):
        return 'Invalid \\u escape sequence: ' + self.escape