Source code for binflakes.sexpr.read

import re
from enum import Enum
from io import StringIO

from attr import attrs, attrib
from attr.validators import instance_of

from binflakes.types import BinWord, BinArray
from .string import ESCAPE_TO_CHAR
from .symbol import Symbol
from .location import TextLocationSingle, TextLocationRange
from .nodes import GenericNode


[docs]class ReadError(Exception):
    """An exception class used for all problems noticed by the reader."""
    pass


[docs]class State(Enum):
    """Represents the reader state (between tokens, inside a string token,
    inside a BinArray token.
    """
    NORMAL = 'normal'
    STRING = 'string'
    BINARRAY = 'binarray'


[docs]@attrs(slots=True)
class StackEntryList:
    """A reader stack entry representing a list currently being parsed.
    ``items`` are the items parsed so far, ``start`` is the location of
    the opening paren.
    """
    start = attrib(validator=instance_of(TextLocationSingle))
    items = attrib(validator=instance_of(list))

    def raise_unclosed_error(self):
        raise ReadError(f'{self.start}: unmatched opening paren')


[docs]@attrs(slots=True)
class StackEntryComment:
    """A reader stack entry representing a commented-out S-expression
    currently being parsed. ``start`` is the location of the opening
    comment sign.
    """
    start = attrib(validator=instance_of(TextLocationRange))

    def raise_unclosed_error(self):
        raise ReadError(f'{self.start}: unclosed S-expr comment')


RE_TOKEN = re.compile(r'''
    # Any amount of whitespace.
    (?P<whitespace>[ \t\r\n\f]+) |
    # Line comment (hash followed by space).
    (?P<line_comment>\#\ .*$) |
    # Left paren.
    (?P<lparen>\() |
    # Start of string (switches parser to STRING state).
    (?:(?P<string_width>[0-9]+)')? (?P<start_quote>") |
    # Start of BinArray (switches parser to BINARRAY state).
    (?P<array_width>[0-9]+)'(?P<array_base>0[box])?\( |
    # These tokens must be followed by whitespace, end of line,
    # or a right paren.
    (?:
        # A right paren.
        (?P<rparen>\)) |
        # The singletons.
        (?P<nil_value>@nil) |
        (?P<bool_value>@true|@false) |
        # Ints and words.
        (?:(?P<word_width>[0-9]+)')? (?P<int_or_word>
            (?P<number>
                -? 0b [0-1]+ |
                -? 0o [0-7]+ |
                -? 0x [0-9a-fA-F]+ |
                -? [1-9][0-9]* |
                -? 0
            ) |
            '(?:
                # Simple unescaped character.
                (?P<raw_char>[^\\']) |
                # A single-character escape.
                \\(?P<simple_escape>[abtnfre\\"]) |
                # A hex character escape.
                \\[xuU](?P<hex_code>
                    (?<=x)[0-9a-fA-F]{2} |
                    (?<=u)[0-9a-fA-F]{4} |
                    (?<=U)[0-9a-fA-F]{6}
                )
            )'
        ) |
        # Symbols.
        (?P<symbol>
            [a-zA-Z*+=<>!?/$%_][0-9a-zA-Z*+=<>!?/$%_-]* |
            -
        )
    )(?= $ | [ \t\r\n\f)] | (?P<ws_error>)) |
    # S-expr comment.
    (?P<sexpr_comment>\#\#)
''', re.VERBOSE)

RE_STRING_ITEM = re.compile(r'''
    # End of string (must be followed by whitespace, end of line,
    # or right paren.
    (?P<end_quote>") (?= $ | [ \t\r\n\f)] | (?P<ws_error>)) |
    # Simple unescaped characters.
    (?P<raw_chars>[^\\"]+) |
    # A single-character escape.
    \\(?P<simple_escape>[abtnfre\\"]) |
    # A hex character escape.
    \\[xuU](?P<hex_code>
        (?<=x)[0-9a-fA-F]{2} |
        (?<=u)[0-9a-fA-F]{4} |
        (?<=U)[0-9a-fA-F]{6}
    )
''', re.VERBOSE)


# Patterns common to all BINARRAY bases.
def _re_binarray_item(digits):
    return re.compile(r'''
        # Any amount of whitespace.
        (?P<whitespace>[ \t\r\n\f]+) |
        # Line comment.
        (?P<line_comment>\# .*$) |
        # These tokens must be followed by whitespace, end of line,
        # or a right paren.
        (?:
            (?P<rparen>\)) |
            ''' + digits + r'''
        ) (?= $ | [ \t\r\n\f)] | (?P<ws_error>))
    ''', re.VERBOSE)


RE_BINARRAY_ITEM = {
    2: _re_binarray_item(r'''(?P<digits>-?[0-1]+)'''),
    8: _re_binarray_item(r'''(?P<digits>-?[0-7]+)'''),
    10: _re_binarray_item(r'''
        (?P<digits>
            -? [1-9][0-9]* |
            -? 0
        )
    '''),
    16: _re_binarray_item(r'''(?P<digits>-?[0-9a-fA-F]+)'''),
}


[docs]class Reader:
    """A class for reading S-expressions and converting them to a node tree.
    Accepts the input line-by-line, yielding top-level S-expressions as they
    are recognized.
    """
[docs]    def __init__(self, filename):
        """Initializes internal state.  ``filename`` affects only the location
        tags that will be attached to nodes.
        """
        self.filename = filename
        self.stack = []
        self.state = State.NORMAL
        self.line = 0
        # Only valid when state is STRING.
        self.string_buffer = None
        # Only valid when state is BINARRAY.
        self.binarray_base = None
        self.binarray_data = None
        # Only valid when state is STRING or BINARRAY.
        self.binarray_width = None
        self.token_start = None

[docs]    def feed_line(self, line):
        """Feeds one line of input into the reader machine.  This method is
        a generator that yields all top-level S-expressions that have been
        recognized on this line (including multi-line expressions whose last
        character is on this line).
        """
        self.line += 1
        pos = 0
        while pos < len(line):
            loc_start = TextLocationSingle(self.filename, self.line, pos + 1)
            if self.state is State.NORMAL:
                item_re = RE_TOKEN
                thing = 'token'
            elif self.state is State.STRING:
                item_re = RE_STRING_ITEM
                thing = 'escape sequence'
            elif self.state is State.BINARRAY:
                item_re = RE_BINARRAY_ITEM[self.binarray_base]
                thing = 'binarray item'
            else:
                assert 0
            match = item_re.match(line, pos)
            if not match:
                raise ReadError(f'{loc_start}: unknown {thing}')
            pos = match.end()
            loc_end = TextLocationSingle(self.filename, self.line, pos + 1)
            loc = loc_start - loc_end
            if match['ws_error'] is not None:
                raise ReadError(f'{loc_end}: no whitespace after token')
            if self.state is State.NORMAL:
                # Normal state -- read tokens.
                if match['lparen'] is not None:
                    self.stack.append(StackEntryList(loc_start, []))
                elif match['rparen'] is not None:
                    if not self.stack:
                        raise ReadError(f'{loc}: unmatched closing paren')
                    top = self.stack.pop()
                    if not isinstance(top, StackEntryList):
                        top.raise_unclosed_error()
                    yield from self._feed_node(top.items, top.start - loc_end)
                elif match['symbol'] is not None:
                    value = Symbol(match['symbol'])
                    yield from self._feed_node(value, loc)
                elif match['sexpr_comment'] is not None:
                    self.stack.append(StackEntryComment(loc))
                elif match['bool_value'] is not None:
                    value = match['bool_value'] == '@true'
                    yield from self._feed_node(value, loc)
                elif match['nil_value'] is not None:
                    yield from self._feed_node(None, loc)
                elif match['int_or_word'] is not None:
                    if match['number'] is not None:
                        value = int(match['number'], 0)
                    elif match['raw_char'] is not None:
                        value = ord(match['raw_char'])
                    elif match['simple_escape'] is not None:
                        value = ord(ESCAPE_TO_CHAR[match['simple_escape']])
                    elif match['hex_code'] is not None:
                        value = int(match['hex_code'], 16)
                        if value not in range(0x110000):
                            raise ReadError(
                                    f'{loc}: not a valid unicode codepoint')
                    else:
                        assert 0
                    if match['word_width'] is not None:
                        width = int(match['word_width'])
                        if value < 0:
                            value += 1 << width
                        if value not in range(1 << width):
                            raise ReadError(f'{loc}: word value out of range')
                        value = BinWord(width, value)
                    yield from self._feed_node(value, loc)
                elif match['array_width'] is not None:
                    self.binarray_base = {
                        '0b': 2,
                        '0o': 8,
                        None: 10,
                        '0x': 16,
                    }[match['array_base']]
                    self.binarray_data = []
                    self.binarray_width = int(match['array_width'])
                    self.token_start = loc_start
                    self.state = State.BINARRAY
                elif match['start_quote'] is not None:
                    self.state = State.STRING
                    self.token_start = loc_start
                    self.string_buffer = StringIO()
                    if match['string_width'] is not None:
                        self.binarray_width = int(match['string_width'])
                    else:
                        self.binarray_width = None
            elif self.state is State.STRING:
                # Inside a string.
                if match['end_quote'] is not None:
                    self.state = State.NORMAL
                    value = self.string_buffer.getvalue()
                    loc = self.token_start - loc_end
                    if self.binarray_width is not None:
                        vals = [ord(x) for x in value]
                        for x in vals:
                            if x not in range(1 << self.binarray_width):
                                raise ReadError(
                                        f'{loc}: character code out of range')
                        value = BinArray(vals, width=self.binarray_width)
                    yield from self._feed_node(value, loc)
                elif match['raw_chars'] is not None:
                    self.string_buffer.write(match['raw_chars'])
                elif match['simple_escape'] is not None:
                    c = ESCAPE_TO_CHAR[match['simple_escape']]
                    self.string_buffer.write(c)
                elif match['hex_code'] is not None:
                    code = int(match['hex_code'], 16)
                    if code not in range(0x110000):
                        raise ReadError(
                                f'{loc}: not a valid unicode codepoint')
                    self.string_buffer.write(chr(code))
                else:
                    assert 0
            elif self.state is State.BINARRAY:
                # In a BinArray.
                if match['rparen'] is not None:
                    self.state = State.NORMAL
                    value = BinArray(self.binarray_data,
                                     width=self.binarray_width)
                    loc = self.token_start - loc_end
                    yield from self._feed_node(value, loc)
                elif match['digits'] is not None:
                    value = int(match['digits'], self.binarray_base)
                    if value < 0:
                        value += 1 << self.binarray_width
                    if value not in range(1 << self.binarray_width):
                        raise ReadError(f'{loc}: word value out of range')
                    self.binarray_data.append(value)
            else:
                assert 0

    def _feed_node(self, value, loc):
        """A helper method called when an S-expression has been recognized.
        Like feed_line, this is a generator that yields newly recognized
        top-level expressions.  If the reader is currently at the top level,
        simply yields the passed expression.  Otherwise, it appends it
        to whatever is currently being parsed and yields nothing.
        """
        node = GenericNode(value, loc)
        if not self.stack:
            yield node
        else:
            top = self.stack[-1]
            if isinstance(top, StackEntryList):
                top.items.append(node)
            elif isinstance(top, StackEntryComment):
                self.stack.pop()
            else:
                assert 0

[docs]    def finish(self):
        """Ensures the reader is in clean state (no unclosed S-expression
        is currently being parsed).  Should be called after the last
        ``feed_line``.
        """
        if self.state is not State.NORMAL:
            raise ReadError(f'EOF while in {self.state.name} state')
        if self.stack:
            top = self.stack[-1]
            top.raise_unclosed_error()


[docs]def read_file(file, filename='<input>'):
    """This is a generator that yields all top-level S-expression nodes from
    a given file object."""
    reader = Reader(filename)
    for line in file:
        yield from reader.feed_line(line)
    reader.finish()


[docs]def read_string(s, filename='<string>'):
    """Reads all S-expressions from a given string and returns a list
    of nodes."""
    return list(read_file(StringIO(s), filename))