deb-python-rjsmin/rjsmin.c

/*
 * Copyright 2011 - 2014
 * Andr\xe9 Malo or his licensors, as applicable
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "cext.h"
EXT_INIT_FUNC;

#define RJSMIN_DULL_BIT          (1 << 0)
#define RJSMIN_PRE_REGEX_BIT     (1 << 1)
#define RJSMIN_REGEX_DULL_BIT    (1 << 2)
#define RJSMIN_REGEX_CC_DULL_BIT (1 << 3)
#define RJSMIN_ID_LIT_BIT        (1 << 4)
#define RJSMIN_ID_LIT_O_BIT      (1 << 5)
#define RJSMIN_ID_LIT_C_BIT      (1 << 6)
#define RJSMIN_STRING_DULL_BIT   (1 << 7)
#define RJSMIN_SPACE_BIT         (1 << 8)

#ifdef EXT3
typedef Py_UNICODE rchar;
#else
typedef unsigned char rchar;
#endif
#define U(c) ((rchar)(c))

#define RJSMIN_IS_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_DULL_BIT))

#define RJSMIN_IS_REGEX_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_REGEX_DULL_BIT))

#define RJSMIN_IS_REGEX_CC_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_REGEX_CC_DULL_BIT))

#define RJSMIN_IS_STRING_DULL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_STRING_DULL_BIT))

#define RJSMIN_IS_ID_LITERAL(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_BIT))

#define RJSMIN_IS_ID_LITERAL_OPEN(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_O_BIT))

#define RJSMIN_IS_ID_LITERAL_CLOSE(c) ((U(c) > 127) || \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_C_BIT))

#define RJSMIN_IS_SPACE(c) ((U(c) <= 127) && \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_SPACE_BIT))

#define RJSMIN_IS_PRE_REGEX_1(c) ((U(c) <= 127) && \
    (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_PRE_REGEX_BIT))


static const unsigned short rjsmin_charmask[128] = {
    396, 396, 396, 396, 396, 396, 396, 396,
    396, 396,   2, 396, 396,   2, 396, 396,
    396, 396, 396, 396, 396, 396, 396, 396,
    396, 396, 396, 396, 396, 396, 396, 396,
    396, 175,  76, 141, 253, 141, 143,  76,
    175, 205, 141, 237, 143, 237, 141, 136,
    253, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 143, 143, 141, 143, 141, 143,
    141, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 171,   1, 197, 141, 253,
    141, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 253, 253, 253, 253, 253,
    253, 253, 253, 175, 143, 207, 141, 253
};

static Py_ssize_t
rjsmin(const rchar *source, rchar *target, Py_ssize_t length,
       int keep_bang_comments)
{
    const rchar *reset, *sentinel = source + length;
    rchar *tstart = target;
    rchar c, quote;

    while (source < sentinel) {
        c = *source++;
        if (RJSMIN_IS_DULL(c)) {
            *target++ = c;
            continue;
        }
        switch (c) {

        /* String */
        case U('\''): case U('"'):
            reset = source;
            *target++ = quote = c;
            while (source < sentinel) {
                c = *source++;
                *target++ = c;
                if (RJSMIN_IS_STRING_DULL(c))
                    continue;
                switch (c) {
                case U('\''): case U('"'):
                    if (c == quote)
                        goto cont;
                    continue;
                case U('\\'):
                    if (source < sentinel) {
                        c = *source++;
                        *target++ = c;
                        if (c == U('\r') && source < sentinel
                            && *source == U('\n'))
                            *target++ = *source++;
                    }
                    continue;
                }
                break;
            }
            target -= source - reset;
            source = reset;
            continue;

        /* Comment or Regex or something else entirely */
        case U('/'):
            if (!(source < sentinel)) {
                *target++ = c;
            }
            else {
                switch (*source) {
            /* Comment */
                case U('*'): case U('/'):
                    goto skip_or_copy_ws;

                default:
                    if (   target == tstart
                        || RJSMIN_IS_PRE_REGEX_1(*(target - 1))
                        || (
                            (target - tstart >= 6)
                            && *(target - 1) == U('n')
                            && *(target - 2) == U('r')
                            && *(target - 3) == U('u')
                            && *(target - 4) == U('t')
                            && *(target - 5) == U('e')
                            && *(target - 6) == U('r')
                            && (
                                   target - tstart == 6
                                || !RJSMIN_IS_ID_LITERAL(*(target - 7))
                            )
                        )) {

            /* Regex */
                        reset = source;
                        *target++ = U('/');
                        while (source < sentinel) {
                            c = *source++;
                            *target++ = c;
                            if (RJSMIN_IS_REGEX_DULL(c))
                                continue;
                            switch (c) {
                            case U('/'):
                                goto cont;
                            case U('\\'):
                                if (source < sentinel) {
                                    c = *source++;
                                    *target++ = c;
                                    if (c == U('\r') || c == U('\n'))
                                        break;
                                }
                                continue;
                            case U('['):
                                while (source < sentinel) {
                                    c = *source++;
                                    *target++ = c;
                                    if (RJSMIN_IS_REGEX_CC_DULL(c))
                                        continue;
                                    switch (c) {
                                    case U('\\'):
                                        if (source < sentinel) {
                                            c = *source++;
                                            *target++ = c;
                                            if (c == U('\r') || c == U('\n'))
                                                break;
                                        }
                                        continue;
                                    case U(']'):
                                        goto cont_regex;
                                    }
                                }
                                break;
                            }
                            break;
                        cont_regex:
                            continue;
                        }
                        target -= source - reset;
                        source = reset;
                    }
                    else {
            /* Just a slash */
                        *target++ = c;
                    }
                    continue;
                }
            }
            continue;

        /* Whitespace */
        default:
        skip_or_copy_ws:
            quote = U(' ');
            --source;
            while (source < sentinel) {
                c = *source++;
                if (RJSMIN_IS_SPACE(c))
                    continue;
                switch (c) {
                case U('\r'): case U('\n'):
                    quote = U('\n');
                    continue;
                case U('/'):
                    if (source < sentinel) {
                        switch (*source) {
                        case U('*'):
                            reset = source++;
                            /* copy bang comment, if requested */
                            if (   keep_bang_comments && source < sentinel
                                && *source == U('!')) {
                                *target++ = U('/');
                                *target++ = U('*');
                                *target++ = *source++;
                                while (source < sentinel) {
                                    c = *source++;
                                    *target++ = c;
                                    if (c == U('*') && source < sentinel
                                        && *source == U('/')) {
                                        *target++ = *source++;
                                        reset = NULL;
                                        break;
                                    }
                                }
                                if (!reset)
                                    continue;
                                target -= source - reset;
                                source = reset;
                            }
                            /* strip regular comment */
                            else {
                                while (source < sentinel) {
                                    c = *source++;
                                    if (c == U('*') && source < sentinel
                                        && *source == U('/')) {
                                        ++source;
                                        reset = NULL;
                                        break;
                                    }
                                }
                                if (!reset)
                                    continue;
                                source = reset;
                                *target++ = U('/');
                            }
                            goto cont;
                        case U('/'):
                            ++source;
                            while (source < sentinel) {
                                c = *source++;
                                switch (c) {
                                case U('\n'):
                                    break;
                                case U('\r'):
                                    if (source < sentinel
                                        && *source == U('\n'))
                                        ++source;
                                    break;
                                default:
                                    continue;
                                }
                                break;
                            }
                            quote = U('\n');
                            continue;
                        }
                    }
                }
                --source;
                break;
            }

            if ((tstart < target && source < sentinel)
                && ((quote == U('\n')
                    && RJSMIN_IS_ID_LITERAL_CLOSE(*(target - 1))
                    && RJSMIN_IS_ID_LITERAL_OPEN(*source))
                    ||
                    (quote == U(' ')
                    && ((RJSMIN_IS_ID_LITERAL(*(target - 1))
                         && RJSMIN_IS_ID_LITERAL(*source))
                        || (source < sentinel
                            && ((*(target - 1) == U('+')
                                 && *source == U('+'))
                                || (*(target - 1) == U('-')
                                    && *source == U('-'))))))))
                *target++ = quote;
        }
    cont:
        continue;
    }
    return (Py_ssize_t)(target - tstart);
}


PyDoc_STRVAR(rjsmin_jsmin__doc__,
"jsmin(script, keep_bang_comments=False)\n\
\n\
Minify javascript based on `jsmin.c by Douglas Crockford`_\\.\n\
\n\
Instead of parsing the stream char by char, it uses a regular\n\
expression approach which minifies the whole script with one big\n\
substitution regex.\n\
\n\
.. _jsmin.c by Douglas Crockford:\n\
   http://www.crockford.com/javascript/jsmin.c\n\
\n\
:Note: This is a hand crafted C implementation built on the regex\n\
       semantics.\n\
\n\
:Parameters:\n\
  `script` : ``str``\n\
    Script to minify\n\
\n\
  `keep_bang_comments` : ``bool``\n\
    Keep comments starting with an exclamation mark? (``/*!...*/``)\n\
\n\
:Return: Minified script\n\
:Rtype: ``str``");

static PyObject *
rjsmin_jsmin(PyObject *self, PyObject *args, PyObject *kwds)
{
    PyObject *script, *keep_bang_comments_ = NULL, *result;
    static char *kwlist[] = {"script", "keep_bang_comments", NULL};
    Py_ssize_t slength, length;
    int keep_bang_comments;
#ifdef EXT2
    int uni;
#define UOBJ "O"
#endif
#ifdef EXT3
#define UOBJ "U"
#endif

    if (!PyArg_ParseTupleAndKeywords(args, kwds, UOBJ "|O", kwlist,
                                     &script, &keep_bang_comments_))
        return NULL;

    if (!keep_bang_comments_)
        keep_bang_comments = 0;
    else {
        keep_bang_comments = PyObject_IsTrue(keep_bang_comments_);
        if (keep_bang_comments == -1)
            return NULL;
    }

#ifdef EXT2
    if (PyUnicode_Check(script)) {
        if (!(script = PyUnicode_AsUTF8String(script)))
            return NULL;
        uni = 1;
    }
    else {
        if (!(script = PyObject_Str(script)))
            return NULL;
        uni = 0;
    }
#endif

#ifdef EXT3
    Py_INCREF(script);
#define PyString_GET_SIZE PyUnicode_GET_SIZE
#define PyString_AS_STRING PyUnicode_AS_UNICODE
#define _PyString_Resize PyUnicode_Resize
#define PyString_FromStringAndSize PyUnicode_FromUnicode
#endif

    slength = PyString_GET_SIZE(script);
    if (!(result = PyString_FromStringAndSize(NULL, slength))) {
        Py_DECREF(script);
        return NULL;
    }
    Py_BEGIN_ALLOW_THREADS
    length = rjsmin((rchar *)PyString_AS_STRING(script),
                    (rchar *)PyString_AS_STRING(result),
                    slength, keep_bang_comments);
    Py_END_ALLOW_THREADS

    Py_DECREF(script);
    if (length < 0) {
        Py_DECREF(result);
        return NULL;
    }
    if (length != slength && _PyString_Resize(&result, length) == -1)
        return NULL;

#ifdef EXT2
    if (uni) {
        script = PyUnicode_DecodeUTF8(PyString_AS_STRING(result),
                                      PyString_GET_SIZE(result), "strict");
        Py_DECREF(result);
        if (!script)
            return NULL;
        result = script;
    }
#endif
    return result;
}

/* ------------------------ BEGIN MODULE DEFINITION ------------------------ */

EXT_METHODS = {
    {"jsmin",
        (PyCFunction)rjsmin_jsmin, METH_VARARGS | METH_KEYWORDS,
        rjsmin_jsmin__doc__},

    {NULL}  /* Sentinel */
};

PyDoc_STRVAR(EXT_DOCS_VAR,
"C implementation of rjsmin\n\
==========================\n\
\n\
C implementation of rjsmin.");


EXT_DEFINE(EXT_MODULE_NAME, EXT_METHODS_VAR, EXT_DOCS_VAR);

EXT_INIT_FUNC {
    PyObject *m;

    /* Create the module and populate stuff */
    if (!(m = EXT_CREATE(&EXT_DEFINE_VAR)))
        EXT_INIT_ERROR(NULL);

    EXT_ADD_UNICODE(m, "__author__", "Andr\xe9 Malo", "latin-1");
    EXT_ADD_STRING(m, "__docformat__", "restructuredtext en");

    EXT_INIT_RETURN(m);
}

/* ------------------------- END MODULE DEFINITION ------------------------- */