JS token regexes

2011-08-22

Sorry, no description yet; I'm just tracking these for my own reference for now...

Note that (at least in JS) the "any" character (.) does not match newlines; you need to match \n explicitly, or use \s if your regex permits it. Some of these patterns depend on this.

Code:
// Token patterns for an ES5 tokenizer. Each literal ends with a semicolon so
// it parses as its own statement; without one, the "/" that opens the next
// literal is read as a division operator.

// whitespace (ES5 7.2; the Unicode "Zs" space separators are still missing)
// \uFEFF is the byte order mark, which ES5 lists as whitespace.
/[ \t\u000B\u000C\u00A0\uFEFF]/;

// line terminator (LF, CR, LS, PS: the ES5 7.3 LineTerminator set)
/[\u000A\u000D\u2028\u2029]/;

// multi line comment
// First alternative: a closed comment. [\s\S] is used for the body so that
// newlines and lone "*" characters inside the comment are accepted; the lazy
// quantifier stops at the first "*/".
// Second alternative: an unclosed comment opener followed by the rest of the
// input, provided no newline follows (. does not match newlines).
/(?:\/\*[\s\S]*?\*\/)|(?:\/\*.*$)/;

// single line comment (. stops at the newline, which ends the comment)
/\/\/.*/;

// number: group 1 is a hex literal, group 2 a decimal literal with optional
// fraction and exponent
/(?:(0[xX][0-9A-Fa-f]+)|((?:(?:(?:(?:[0-9]+)(?:\.[0-9]*)?))|(?:\.[0-9]+))(?:[eE][-+]?[0-9]{1,})?))/;

// ascii identifier (the simple version; matches one identifier character)
/[a-zA-Z0-9\$_]/;

// punctuators, longest first because alternation takes the first alternative
// that matches. Inside the character class "-" is escaped (a bare "+-*" is an
// out-of-order range and a SyntaxError) and "&" is included.
/>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|&&|\|\||\+=|-=|\*=|%=|\&=|\|=|\^=|\/=|[{}()[\].;,<>+\-*%&|^!~?:=\/]|\//;


Here's the full commented source; I'm not using it anymore:

Code:
/**
 * Builds a global, case-insensitive regex that matches a run of one or more
 * identifier parts: word characters, digits, "$", or a literal \uXXXX escape.
 *
 * The pattern does NOT reject identifiers that start with a digit.
 *
 * @param {*} testing accepted for symmetry with the other builders; unused
 * @returns {RegExp} a fresh regex with the "g" and "i" flags
 */
var getIdentifierRegex = function(testing){
  // \w already covers digits and the underscore
  var wordChar = '[\\w\\d$]';
  // a full unicode escape (legal in identifiers): backslash, "u", four hex digits
  var unicodeEscape = '(?:\\\\u(?:[\\da-f]){4})';

  var alternatives = [wordChar, unicodeEscape];
  var grouped = [];
  for (var i = 0; i < alternatives.length; ++i) {
    grouped.push('(?:' + alternatives[i] + ')');
  }

  return new RegExp('(?:' + grouped.join('|') + ')+', 'ig');
};

/**
 * Builds the regex for numeric literals.
 *
 * Grammar: hex | ((( 0 | [1-9]digit* )( . digit* )? | . digit+ )( exp )?)
 *
 * @param {*} testing when falsy, the regex is returned with the "g" and "i"
 *   flags. When truthy, an anchored copy is run against built-in samples,
 *   the outcome of each sample is written to the console, and nothing is
 *   returned.
 * @returns {RegExp|undefined}
 */
function getNumberRegex(testing){
  var group = function(pattern){ return '(?:' + pattern + ')'; };
  var optional = function(pattern){ return group(pattern) + '?'; };

  var digit = '\\d';
  var point = '\\.';

  var hexLiteral = group('0x[\\da-f]+');
  // a leading zero stands alone; otherwise a non-zero digit and any digits
  var intPart = group('0|[1-9]' + digit + '*');
  // the dot is required for a fraction, the fraction digits are not
  var optFraction = optional(point + digit + '*');
  var leadingInt = group(intPart + optFraction);
  // a leading dot must be followed by at least one digit
  var leadingDot = group(point + digit + '+');
  // "e", an optional sign, then one or more digits (zeroes allowed)
  var exponent = optional('e[-+]?' + digit + '+');
  var decimalLiteral = group(group(leadingInt + '|' + leadingDot) + exponent);

  var body = group(hexLiteral + '|' + decimalLiteral);

  if (!testing) return new RegExp(body, 'gi');

  // self-test: each sample must match (or fail to match) in its entirety
  var anchored = new RegExp('^' + body + '$', 'i');

  var shouldMatch = [
    '25', '0', '0.1234', '.1234', '.00', '.0', '0.', '500.',
    '1e2', '1e15', '1e05', '1e41321', '1e-0', '1e+0',
    '0.e15', '0.e-15', '0.e+15',
    '.0e15', '.0e+15', '.0e-15', '.0e-0',
    '0.15e+125',
    '0x15', '0x0', '0xdeadbeefcace', '0X500dead'
  ];
  var shouldFail = [
    '00', '00.', '00.0',
    '.e', '.e5', '.e+15',
    '15e+', '15e-', '.15e+', '.15e-',
    '00xfeed', '00Xfeed'
  ];

  var report = function(title, samples, expected){
    console.log(title);
    for (var i = 0; i < samples.length; ++i) {
      var sample = samples[i];
      if (anchored.test(sample) === expected) console.log('okay: ' + sample);
      else console.warn('fail: ' + sample);
    }
  };

  report("the goods:", shouldMatch, true);
  report("the bads:", shouldFail, false);
}

/**
 * Builds the source text of the regex that determines which token to parse
 * next.
 *
 * The result is a STRING shaped like a regex literal, "/(a)?(b)?.../", not a
 * RegExp object. The pattern is meant to be compiled with the global flag and
 * applied to the entire input with lastIndex set to the current offset, which
 * makes matching start there.
 *
 * Capture groups, in order:
 *   1 whitespace, 2 line terminator, 3 single-line comment opener,
 *   4 multi-line comment opener, 5 single quote, 6 double quote,
 *   7 start of a number, 8 the slash that opens a regex literal,
 *   9 punctuator.
 *
 * Every group is optional, so the compiled pattern can also match the empty
 * string, and adjacent groups can match one after another; callers have to
 * inspect the groups rather than rely on a null result.
 *
 * @returns {string} the pattern wrapped in a leading and a trailing slash
 */
var getLastindexStartRegex = function(){
  // Punctuators are listed long to short: alternation takes the first
  // alternative that matches, so a longer punctuator must come before any of
  // its prefixes.
  var punc = [
    '>>>=',
    '===','!==','>>>','<<=','>>=',
    '<=','>=','==','!=','\\+\\+','--','<<','>>','\\&\\&','\\|\\|','\\+=','-=','\\*=','%=','\\&=','\\|=','\\^=','\\/=',
    '\\{','\\}','\\(','\\)','\\[','\\]','\\.',';',',','<','>','\\+','-','\\*','%','\\|','\\&','\\^','!','~','\\?',':','=','\\/'
  ];

  // everything is wrapped in (<start>)?
  var starts = [
    '[ \\t\\u000B\\u000C\\u00A0\\uFEFF]', // whitespace (\uFEFF is the BOM)
    '[\\u000A\\u000D\\u2028\\u2029]', // lineterminators
    '\\/\\/', // single comment
    '\\/\\*', // multi comment
    '\'', // single string
    '"', // double string
    '\\.?[0-9]', // numbers
    // regex: the leading "?:" turns the wrapper below into a non-capturing
    // group, so only the slash itself is captured; the character after the
    // slash must not be "="
    '?:(\\/)[^=]',
    punc.join('|')
  ];

  // basic structure: /(token)?(token)?(token)?.../
  // the match starts at lastIndex but need not cover the rest of the input
  var s = '/' + starts.map(function(start){ return '('+start+')?'; }).join('') + '/';

  return s;
};

/**
 * Builds the regex that determines which token to parse next.
 *
 * It should be applied to the next four characters of the input (four
 * because that is the length of the longest punctuator, ">>>="). The pattern
 * is anchored at the start of that substring.
 *
 * Capture groups, in order:
 *   1 whitespace, 2 line terminator, 3 single-line comment opener,
 *   4 multi-line comment opener, 5 single quote, 6 double quote,
 *   7 start of a number, 8 a slash with an optional "=", 9 punctuator.
 *
 * Every group is optional, so the pattern also matches when no group does;
 * callers have to inspect the groups.
 *
 * @param {*} testing when truthy the pattern is also anchored at the end, so
 *   it only matches if the groups consume the whole string
 * @returns {RegExp} a fresh regex without flags
 */
var getSubstringStartRegex = function(testing){
  // Punctuators are listed long to short: alternation takes the first
  // alternative that matches, so a longer punctuator must come before any of
  // its prefixes.
  var punc = [
    '>>>=',
    '===','!==','>>>','<<=','>>=',
    '<=','>=','==','!=','\\+\\+','--','<<','>>','\\&\\&','\\|\\|','\\+=','-=','\\*=','%=','\\&=','\\|=','\\^=','\\/=',
    '\\{','\\}','\\(','\\)','\\[','\\]','\\.',';',',','<','>','\\+','-','\\*','%','\\|','\\&','\\^','!','~','\\?',':','=','\\/'
  ];

  // everything is wrapped in (<start>)?
  var starts = [
    // whitespace: http://es5.github.com/#WhiteSpace (\uFEFF is the BOM)
    '[\\u0009\\u000B\\u000C\\u0020\\u00A0\\uFEFF]',
    '[\\u000A\\u000D\\u2028\\u2029]', // lineterminators: http://es5.github.com/#LineTerminator
    '\\/\\/', // single comment
    '\\/\\*', // multi comment
    '\'', // single string
    '"', // double string
    '\\.?[0-9]', // numbers
    '\\/=?', // regex
    punc.join('|')
  ];

  // basic structure: /^(token)?(token)?(token)?.../
  // the match starts at the left but need not cover the whole substring
  var s = '^' + starts.map(function(start){ return '('+start+')?'; }).join('') + (testing?'$':'');

  return new RegExp(s);
};

/**
 * Builds the regex that matches a complete string literal delimited by the
 * given quote character.
 *
 * @param {string} quote the delimiter. It is inserted verbatim into the
 *   pattern (also inside a character class), so it must be a character with
 *   no special regex meaning; in practice ' or ".
 * @param {*} testing when truthy, a built-in self-test is run first and the
 *   outcome of each sample is written to the console
 * @returns {RegExp} a fresh regex with the "g", "i" and "m" flags, in both
 *   modes
 */
var getStringBodyRegex = function(quote, testing){
  // alternatives are tried in order: well-formed \u and \x escapes first, so
  // that malformed ones fall through and are rejected
  var parts = [
    '\\\\u[\\da-f]{4}', // unicode escape, \u1234
    '\\\\x[\\da-f]{2}', // hex escape, \x12
    '\\\\(?:(?:\\u000D\\u000A)|[\\u000A\\u000D\\u2028\\u2029])', // line continuation
    '\\\\[^xu]', // single char escape, but dont allow u or x here
    '[^\\n\\\\'+quote+']' // anything but a newline, backslash or target quote (we want to fail malformed \x and \u)
  ];
  var body = parts.map(function(part){
    return '(?:'+part+')';
  }).join('|');

  var matcher = new RegExp(quote+'(?:'+body+')*'+quote, 'img');

  if (!testing) return matcher;

  // rest is just for running tests...

  // slightly different regex for testing: it must match exactly the whole
  // test case, which is a string body followed by the closing quote
  var tester = new RegExp('^(?:'+body+')*'+quote+'$', 'im');

  // bodies that are valid once the closing quote is appended
  var good = [
    'foo',
    'foo\\s',
    '\\sfoo',
    'foo\\sbar',
    'foo\\\\n',
    '\\\\nfoo',
    'foo\\\\nbar',
    'foo\\\n',
    '\\\nfoo',
    'foo\\\nbar',
    'foo\\u1234',
    '\\u0badfoo',
    'foo\\udeadbar',
    'foo\\x15',
    '\xabfoo',
    'foo\x10bar'
  ];
  // inputs that must be rejected as given: no closing quote, a raw newline,
  // or a malformed unicode escape
  var bad = [
    'foo',
    '\n',
    'foo\n',
    '\nfoo',
    '\\u123h'+quote,
    '\\u123'+quote
  ];

  console.log("the goods:");
  good.forEach(function(str){
    // close the body with the quote under test, not a hard-coded one
    var literal = str+quote;
    if (tester.test(literal)) console.log('okay: '+literal);
    else console.warn('fail: '+literal);
  });
  console.log("the bads:");
  bad.forEach(function(str){
    if (tester.test(str)) console.warn('fail: '+str);
    else console.log('okay: '+str);
  });

  return matcher;
};