pvdz.ee - JS token regexes

Sorry, no description yet. I'm just tracking these for myself for now...

Note: in JS (at least), newlines are not matched by . (the "any" character); you need to explicitly match \n, or even \s if your regex permits it. Some of these patterns depend on this.

Code:

// whitespace (missing the exotic unicode space separators)
// \uFEFF is the byte order mark, which ES5 lists as whitespace
/[ \t\u000B\u000C\u00A0\uFEFF]/;

// line terminator (LF, CR, LS and PS)
/[\u000A\u000D\u2028\u2029]/;

// multi line comment
// first alternative: a closed comment; the body is matched with [\s\S] so it
// may contain newlines and asterisks
// second alternative: an unclosed comment that runs to the end of the input
// without crossing a newline (. does not match newlines)
/(?:\/\*[\s\S]*?\*\/)|(?:\/\*.*$)/;

// single line comment
/\/\/.*/;

// number (hex, or decimal with optional fraction and exponent)
/(?:(0[xX][0-9A-Fa-f]+)|((?:(?:(?:(?:[0-9]+)(?:\.[0-9]*)?))|(?:\.[0-9]+))(?:[eE][-+]?[0-9]{1,})?))/;

// ascii identifier character (the simple version)
/[a-zA-Z0-9\$_]/;

// punctuators, longest first because alternation takes the first alternative that matches
// the - inside the class is escaped so that +-* is not read as a (descending, invalid) range
/>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|&&|\|\||\+=|-=|\*=|%=|\&=|\|=|\^=|\/=|[{}()[\].;,<>+\-*%&|^!~?:=\/]/;

Here's the full commented source; I'm not using it anymore:

Code:

// Returns a global, case-insensitive regex that matches a run of one or
// more identifier characters. The `testing` argument is not used here.
var getIdentifierRegex = function(testing){
    var alternatives = [];

    // a letter, digit, underscore (all covered by \w) or a dollar sign
    // identifiers cannot start with a digit, this regex does NOT check for that
    alternatives.push('[\\w\\d$]');

    // a full unicode escape (legal in identifiers): backslash, u, four hex digits
    alternatives.push('(?:\\\\u(?:[\\da-f]){4})');

    // put every alternative in its own non-capturing group
    var grouped = [];
    for (var i = 0; i < alternatives.length; ++i) {
        grouped.push('(?:' + alternatives[i] + ')');
    }

    // one or more of any of the alternatives
    var pattern = '(?:' + grouped.join('|') + ')+';

    return new RegExp(pattern, 'ig');
};

// Builds the regex for numeric literals (hex, or decimal with optional
// fraction and exponent).
//
// Without `testing` a global, case-insensitive RegExp is returned. With
// `testing` set, an anchored version of the same pattern is run against a
// list of valid and invalid literals, the outcome of every sample is
// written to the console, and nothing is returned.
function getNumberRegex(testing){

    // puts a pattern in a non-capturing group
    var group = function(pattern){
        return '(?:' + pattern + ')';
    };

    var digit = '\\d';
    var point = '\\.';

    var hexLiteral = group('0x[\\da-f]+');

    // if the first digit is zero, no more will follow
    var integerPart = group('0' + '|' + '[1-9]' + digit + '*');

    // optional dot with optional fraction (if fraction, dot is required)
    var optionalFraction = group(point + digit + '*') + '?';

    // integer with optional fraction
    var integerWithFraction = group(integerPart + optionalFraction);

    // leading dot with required fraction part
    var fractionOnly = group(point + digit + '+');

    // either digits[.[digits]] or .digits
    var mantissa = group(integerWithFraction + '|' + fractionOnly);

    // e is a suffix, can be followed by positive or negative sign,
    // which must be followed by one or more digits, even zeroes
    var optionalExponent = group('e[-+]?\\d+') + '?';

    // body = hex | ((( 0 | [1-9]int* )( . int* )? | .int+)(exp)?)
    var body = group(hexLiteral + '|' + group(mantissa + optionalExponent));

    if (!testing) {
        return new RegExp(body, 'gi');
    }

    // test cases...

    // anchored on both sides so that a sample has to match as a whole
    var anchored = new RegExp('^' + body + '$', 'i');

    var valid = [
        '25', '0',
        '0.1234', '.1234', '.00', '.0', '0.', '500.',
        '1e2', '1e15', '1e05', '1e41321', '1e-0', '1e+0',
        '0.e15', '0.e-15', '0.e+15',
        '.0e15', '.0e+15', '.0e-15', '.0e-0',
        '0.15e+125',
        '0x15', '0x0', '0xdeadbeefcace', '0X500dead'
    ];
    var invalid = [
        '00', '00.', '00.0',
        '.e', '.e5', '.e+15',
        '15e+', '15e-', '.15e+', '.15e-',
        '00xfeed', '00Xfeed'
    ];

    // logs "okay" when a sample behaves as expected and warns "fail" otherwise
    var check = function(samples, shouldMatch){
        for (var i = 0; i < samples.length; ++i) {
            var sample = samples[i];
            if (anchored.test(sample) === shouldMatch) console.log('okay: ' + sample);
            else console.warn('fail: ' + sample);
        }
    };

    console.log("the goods:");
    check(valid, true);
    console.log("the bads:");
    check(invalid, false);
}

// Builds the source of the regex that determines which token to parse next.
// The pattern is not anchored: compile it with the global flag and apply it
// to the entire input with lastIndex set to the offset of the next token
// (the global flag is what makes the regex start there).
//
// NOTE: the returned value is a string that includes the literal "/"
// delimiters, not a RegExp object. Drop the first and last character
// before handing it to the RegExp constructor.

var getLastindexStartRegex = function(){
    // note: punctuators must be listed long to short. regex alternation takes
    // the first alternative that matches (not the longest one), so a shorter
    // punctuator must never come before a longer one it is a prefix of.
    var punc = [
        '>>>=',
        '===','!==','>>>','<<=','>>=',
        '<=','>=','==','!=','\\+\\+','--','<<','>>','\\&\\&','\\|\\|','\\+=','-=','\\*=','%=','\\&=','\\|=','\\^=','\\/=',
        '\\{','\\}','\\(','\\)','\\[','\\]','\\.',';',',','<','>','\\+','-','\\*','%','\\|','\\&','\\^','!','~','\\?',':','=','\\/'
    ];

    // everything is wrapped in (<start>)?
    var starts = [
        // whitespace: space, tab, vertical tab, form feed, no-break space and
        // the byte order mark (\uFEFF, see http://es5.github.com/#WhiteSpace)
        '[ \\t\\u000B\\u000C\\u00A0\\uFEFF]',
        '[\\u000A\\u000D\\u2028\\u2029]', // lineterminators
        '\\/\\/', // single comment
        '\\/\\*', // multi comment
        '\'', // single string
        '"', // double string
        '\\.?[0-9]', // numbers
        // regex: the leading ?: makes the wrapper non-capturing, so only the
        // slash is captured; the match itself also consumes the character
        // after the slash, which must not be =
        '?:(\\/)[^=]',
        punc.join('|')
    ];

    // basic structure: /(token)?(token)?(token)?.../
    // every group is optional, so the pattern as a whole always matches;
    // the match may be empty and consecutive groups may match in one go
    var s = '/' + starts.map(function(start){ return '('+start+')?'; }).join('') + '/';

    // the caller finds out which token starts at the offset by looking at
    // which capture groups are set (an empty match means none of them did)
    return s;
};

// Builds the regex that determines which token to parse next.
// It is anchored to the start of its input, so it should be applied to a
// substring that begins at the next token; four characters are enough
// because that is the length of the longest punctuator (>>>=).
//
// With `testing` set the pattern is also anchored at the end, so that the
// input has to be matched as a whole.

var getSubstringStartRegex = function(testing){
    // note: punctuators must be listed long to short. regex alternation takes
    // the first alternative that matches (not the longest one), so a shorter
    // punctuator must never come before a longer one it is a prefix of.
    var punc = [
        '>>>=',
        '===','!==','>>>','<<=','>>=',
        '<=','>=','==','!=','\\+\\+','--','<<','>>','\\&\\&','\\|\\|','\\+=','-=','\\*=','%=','\\&=','\\|=','\\^=','\\/=',
        '\\{','\\}','\\(','\\)','\\[','\\]','\\.',';',',','<','>','\\+','-','\\*','%','\\|','\\&','\\^','!','~','\\?',':','=','\\/'
    ];


    // everything is wrapped in (<start>)?
    var starts = [
        // whitespace: http://es5.github.com/#WhiteSpace
        // (tab, vertical tab, form feed, space, no-break space, byte order mark)
        '[\\u0009\\u000B\\u000C\\u0020\\u00A0\\uFEFF]',
        '[\\u000A\\u000D\\u2028\\u2029]', // lineterminators: http://es5.github.com/#LineTerminator
        '\\/\\/', // single comment
        '\\/\\*', // multi comment
        '\'', // single string
        '"', // double string
        '\\.?[0-9]', // numbers
        '\\/=?', // regex
        punc.join('|')
    ];

    // basic structure: /^(token)?(token)?(token)?.../
    // the match needs to start at the left but might not cover the entire input part;
    // every group is optional, so the caller reads the result from the capture groups
    var s = '^' + starts.map(function(start){ return '('+start+')?'; }).join('') + (testing?'$':'');

    return new RegExp(s);
};

// Builds the regex that matches a complete string literal delimited by
// `quote` (a single or a double quote character).
//
// The returned RegExp is global, case-insensitive and multiline. With
// `testing` set, the body pattern is first run against a list of valid and
// invalid string bodies and the outcome of every sample is written to the
// console; the same RegExp is returned afterwards.
var getStringBodyRegex = function(quote, testing){
    // unicode hex escape+any-char non-newline-char
    var parts = [
        '\\\\u[\\da-f]{4}', // unicode escape, \u1234
        '\\\\x[\\da-f]{2}', // hex escape, \x12
        // line continuation: a backslash followed by CRLF or by a single line terminator
        '\\\\(?:(?:\\u000D\\u000A)|[\\u000A\\u000D\\u2028\\u2029])',
        '\\\\[^xu]', // single char escape, but dont allow u or x here
        '[^\\n\\\\'+quote+']' // anything but a newline,backslash or target quote (we want to fail malformed \x and \u)
    ];
    var body = parts.map(function(part){
        return '(?:'+part+')';
    }).join('|');

    var regex = new RegExp(quote+'(?:'+body+')*'+quote, 'img');

    if (!testing) return regex;

    // rest is just for running tests...

    // slightly different regex for testing (i want it to match exactly the whole test case)
    // the opening quote is left out, the samples below are string bodies
    var testRegex = new RegExp('^(?:'+body+')*'+quote+'$', 'im');

    var good = [
        'foo',
        'foo\\s',
        '\\sfoo',
        'foo\\sbar',
        'foo\\\\n',
        '\\\\nfoo',
        'foo\\\\nbar',
        'foo\\\n',
        '\\\nfoo',
        'foo\\\nbar',
        'foo\\u1234',
        '\\u0badfoo',
        'foo\\udeadbar',
        'foo\\x15',
        '\xabfoo',
        'foo\x10bar'
    ];
    var bad = [
        'foo', // no closing quote
        '\n',
        'foo\n',
        '\nfoo',
        '\\u123h'+quote,
        '\\u123'+quote
    ];

    console.log("the goods:");
    good.forEach(function(str){
        // close the sample with the quote under test, not a fixed double quote
        var sample = str+quote;
        if (testRegex.test(sample)) console.log('okay: '+sample);
        else console.warn('fail: '+sample);
    });
    console.log("the bads:");
    bad.forEach(function(str){
        if (testRegex.test(str)) console.warn('fail: '+str);
        else console.log('okay: '+str);
    });

    return regex;
};