stacktach-timex/timex/lexer.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
# Copyright © 2014 Rackspace Hosting.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import logging
import sys

import ply.lex

from timex.expression import TimexLexerError


logger = logging.getLogger(__name__)


class TimexLexer(object):
    """Lexing/tokenising for time expressions"""

    def __init__(self, debug=False):
        self.debug = debug
        if not self.__doc__:
            raise TimexLexerError("Docstring information is missing. "
                                  "Timex uses PLY which requires "
                                  "docstrings for configuration.")
        self.lexer = ply.lex.lex(module=self,
                                 debug=self.debug,
                                 errorlog=logger)
        self.lexer.string_value = None
        self.latest_newline = 0

    def input(self, string):
        self.lexer.input(string)

    def token(self):
        token = self.lexer.token()
        if token is None:
            if self.lexer.string_value is not None:
                raise TimexLexerError("Unexpected EOF in expression")
        else:
            token.col = token.lexpos - self.latest_newline
        return token

    reserved_words = {
        'to': 'TO',
        'us': 'MICROSECOND',
        's': 'SECOND',
        'sec': 'SECOND',
        'm': 'MINUTE',
        'min': 'MINUTE',
        'h': 'HOUR',
        'hr': 'HOUR',
        'd': 'DAY',
        'mo': 'MONTH',
        'y': 'YEAR',
        'yr': 'YEAR',
    }

    tokens = ('NUMBER',
              'PLUS',
              'MINUS',
              'REPLACE',
              'RPAREN',
              'LPAREN',
              'VAR',
              'IDENTIFIER') + tuple(set(reserved_words.values()))

    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_REPLACE = r'@'
    t_VAR = r'\$'
    t_LPAREN = r'\('
    t_RPAREN = r'\)'

    def t_IDENTIFIER(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.reserved_words.get(t.value, 'IDENTIFIER')
        return t

    def t_NUMBER(self, t):
        r'\d+'
        t.value = int(t.value)
        return t

    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        self.latest_newline = t.lexpos

    t_ignore = ' \t'

    def t_error(self, t):
        raise TimexLexerError('Error on line %s, col %s: Unexpected character:'
                              ' %s ' % (t.lexer.lineno,
                                        t.lexpos - self.latest_newline,
                                        t.value[0]))


if __name__ == '__main__':
    logging.basicConfig()
    lexer = TimexLexer(debug=True)
    lexer.input(sys.stdin.read())
    token = lexer.token()
    while token:
        print('%-20s%s' % (token.value, token.type))
        token = lexer.token()