565 lines
20 KiB
JavaScript
565 lines
20 KiB
JavaScript
(function(root, factory) {
|
|
if (typeof module === 'object' && module.exports) {
|
|
module.exports = factory();
|
|
} else {
|
|
root.nearley = factory();
|
|
}
|
|
}(this, function() {
|
|
|
|
/**
 * A single grammar production.
 *
 * @param {string} name - name of the nonterminal this rule produces.
 * @param {Array} symbols - right-hand side: a list of
 *     literal | regex class | nonterminal.
 * @param {Function} [postprocess] - optional callback stored on the rule.
 */
function Rule(name, symbols, postprocess) {
    // Every rule receives the next value of a shared, monotonically
    // increasing counter as its id.
    Rule.highestId += 1;
    this.id = Rule.highestId;
    this.name = name;
    this.symbols = symbols;
    this.postprocess = postprocess;
    return this;
}
Rule.highestId = 0;

/**
 * Render the rule as "name → symbols".
 *
 * @param {number} [withCursorAt] - when given, a " ● " marker is inserted
 *     before the symbol at this index.
 * @returns {string} the printable form of the rule.
 */
Rule.prototype.toString = function(withCursorAt) {
    var rendered;
    if (typeof withCursorAt === "undefined") {
        rendered = this.symbols.map(getSymbolShortDisplay).join(' ');
    } else {
        var beforeCursor = this.symbols.slice(0, withCursorAt).map(getSymbolShortDisplay).join(' ');
        var afterCursor = this.symbols.slice(withCursorAt).map(getSymbolShortDisplay).join(' ');
        rendered = beforeCursor + " ● " + afterCursor;
    }
    return this.name + " → " + rendered;
}
|
|
|
|
|
|
/**
 * A State is a rule at a position from a given starting point in the input
 * stream (reference).
 *
 * @param {Rule} rule - the production being matched.
 * @param {number} dot - number of the rule's symbols matched so far.
 * @param {number} reference - index of the column where this match started.
 * @param {Array} wantedBy - states that are advanced when this one completes.
 */
function State(rule, dot, reference, wantedBy) {
    this.rule = rule;
    this.dot = dot;
    this.reference = reference;
    this.data = [];
    this.wantedBy = wantedBy;
    // Complete once the dot sits past the last symbol of the rule.
    this.isComplete = this.dot === rule.symbols.length;
}

/**
 * @returns {string} the rule with a cursor at the dot, plus the start index.
 */
State.prototype.toString = function() {
    var ruleText = this.rule.toString(this.dot);
    var origin = this.reference || 0;
    return "{" + ruleText + "}, from: " + origin;
};

/**
 * Create the state reached by matching one more symbol.
 *
 * @param {Object} child - whatever matched the symbol; its `data` is read
 *     when the new state is complete.
 * @returns {State} a new state with the dot advanced by one.
 */
State.prototype.nextState = function(child) {
    var advanced = new State(this.rule, this.dot + 1, this.reference, this.wantedBy);
    advanced.left = this;
    advanced.right = child;
    if (!advanced.isComplete) {
        return advanced;
    }
    advanced.data = advanced.build();
    // Keeping `right` set here would prevent the right state and its
    // children from being garbage collected.
    advanced.right = undefined;
    return advanced;
};

/**
 * Collect the `data` of every matched child by walking the `left` chain.
 *
 * @returns {Array} child data in left-to-right order.
 */
State.prototype.build = function() {
    var collected = [];
    var cursor = this;
    // The state itself always contributes, hence the test at the bottom.
    for (;;) {
        collected.push(cursor.right.data);
        cursor = cursor.left;
        if (!cursor.left) {
            break;
        }
    }
    // Children were gathered last-to-first.
    collected.reverse();
    return collected;
};

/**
 * Replace `data` with the result of the rule's postprocess callback, if the
 * rule has one. The callback receives the data, the reference and the
 * Parser.fail sentinel.
 */
State.prototype.finish = function() {
    var rule = this.rule;
    if (!rule.postprocess) {
        return;
    }
    this.data = rule.postprocess(this.data, this.reference, Parser.fail);
};
|
|
|
|
|
|
/**
 * One column of the parse table: the set of states at a given input
 * position.
 *
 * @param {Grammar} grammar - grammar whose `byName` index drives prediction.
 * @param {number} index - position of this column in the table.
 */
function Column(grammar, index) {
    this.grammar = grammar;
    this.index = index;
    this.states = [];
    // The dictionaries below are keyed by grammar-defined nonterminal names.
    // They are prototype-less so that names such as "constructor" or
    // "toString" cannot resolve to members of Object.prototype.
    this.wants = Object.create(null); // states indexed by the non-terminal they expect
    this.scannable = []; // list of states that expect a token
    this.completed = Object.create(null); // states that are nullable
}

/**
 * Run completion and prediction over every state in the column, including
 * states appended while the loop is running. States that expect a terminal
 * are queued in `scannable`.
 *
 * @param {Column} [nextColumn] - accepted for compatibility; not used.
 */
Column.prototype.process = function(nextColumn) {
    var states = this.states;
    var wants = this.wants;
    var completed = this.completed;

    for (var w = 0; w < states.length; w++) { // nb. we push() during iteration
        var state = states[w];

        if (state.isComplete) {
            state.finish();
            if (state.data !== Parser.fail) {
                // complete: advance every state that was waiting for this rule
                var wantedBy = state.wantedBy;
                for (var i = wantedBy.length; i--; ) { // this line is hot
                    this.complete(wantedBy[i], state);
                }

                // special-case nullables: a rule that completed in the column
                // it started in must also complete future predictors.
                if (state.reference === this.index) {
                    var ruleName = state.rule.name;
                    (completed[ruleName] = completed[ruleName] || []).push(state);
                }
            }
            continue;
        }

        // queue scannable states (the next symbol is not a nonterminal name)
        var exp = state.rule.symbols[state.dot];
        if (typeof exp !== 'string') {
            this.scannable.push(state);
            continue;
        }

        // predict
        if (wants[exp]) {
            // Already predicted in this column: register interest and replay
            // any nullable completions recorded so far.
            wants[exp].push(state);

            var nulls = completed[exp];
            if (nulls) {
                for (var k = 0; k < nulls.length; k++) {
                    this.complete(state, nulls[k]);
                }
            }
        } else {
            wants[exp] = [state];
            this.predict(exp);
        }
    }
}

/**
 * Append a fresh start state for every rule that produces `exp`.
 * `this.wants[exp]` is shared by reference with each new state as its
 * `wantedBy` list, so it should be set before calling.
 *
 * @param {string} exp - nonterminal name to predict.
 */
Column.prototype.predict = function(exp) {
    var byName = this.grammar.byName;
    // Own-property lookup only: inherited members such as "constructor"
    // are not rule lists.
    var own = Object.prototype.hasOwnProperty.call(byName, exp) ? byName[exp] : null;
    var rules = own || [];
    var wantedBy = this.wants[exp];

    for (var i = 0; i < rules.length; i++) {
        this.states.push(new State(rules[i], 0, this.index, wantedBy));
    }
}

/**
 * Advance `left` over the completed (or nullable) `right` and append the
 * resulting state to this column.
 *
 * @param {State} left - state waiting for the nonterminal.
 * @param {State} right - state that matched it.
 */
Column.prototype.complete = function(left, right) {
    this.states.push(left.nextState(right));
}
|
|
|
|
|
|
/**
 * A set of rules plus the name of the start nonterminal.
 *
 * @param {Array} rules - rule objects; each must expose a `name`.
 * @param {string} [start] - start nonterminal; defaults to the name of the
 *     first rule.
 */
function Grammar(rules, start) {
    this.rules = rules;
    this.start = start || this.rules[0].name;
    var byName = this.byName = {};
    // Borrow hasOwnProperty from Object.prototype: rule names are arbitrary
    // strings, so a rule called "hasOwnProperty" would otherwise replace the
    // method on `byName` and break the next lookup.
    var hasOwn = Object.prototype.hasOwnProperty;
    this.rules.forEach(function(rule) {
        if (!hasOwn.call(byName, rule.name)) {
            byName[rule.name] = [];
        }
        byName[rule.name].push(rule);
    });
}

/**
 * Build a Grammar either from a compiled grammar object (with `ParserRules`,
 * `ParserStart` and optionally `Lexer`) or from a plain (rules, start) pair.
 * So we can allow passing (rules, start) directly to Parser for backwards
 * compatibility.
 *
 * @param {Object|Array} rules - compiled grammar object, or an array of rule
 *     descriptions with `name`, `symbols` and `postprocess`.
 * @param {string} [start] - start nonterminal; overridden by
 *     `rules.ParserStart` when that is set.
 * @returns {Grammar} the grammar, with `lexer` set to `rules.Lexer`.
 */
Grammar.fromCompiled = function(rules, start) {
    // Read the lexer before `rules` is rebound to the rule list.
    var lexer = rules.Lexer;
    if (rules.ParserStart) {
        start = rules.ParserStart;
        rules = rules.ParserRules;
    }
    var compiledRules = rules.map(function (r) {
        return new Rule(r.name, r.symbols, r.postprocess);
    });
    var g = new Grammar(compiledRules, start);
    g.lexer = lexer; // nb. storing lexer on Grammar is iffy, but unavoidable
    return g;
}
|
|
|
|
|
|
/**
 * Fallback lexer that emits one token per element of the input buffer
 * (one character at a time for strings), tracking line and column.
 */
function StreamLexer() {
    this.reset("");
}

/**
 * Start lexing a new buffer.
 *
 * @param {string|Array} data - the buffer to read from.
 * @param {Object} [state] - position info from save() ({line, col}) to
 *     continue from.
 */
StreamLexer.prototype.reset = function(data, state) {
    this.buffer = data;
    this.index = 0;
    if (state) {
        this.line = state.line;
        // Negative offset so that `index - lastLineBreak` continues the
        // saved column count.
        this.lastLineBreak = -state.col;
    } else {
        this.line = 1;
        this.lastLineBreak = 0;
    }
}

/**
 * @returns {Object|undefined} `{value: element}` for the next buffer
 *     element, or undefined when the buffer is exhausted.
 */
StreamLexer.prototype.next = function() {
    if (!(this.index < this.buffer.length)) {
        return;
    }
    var element = this.buffer[this.index];
    this.index += 1;
    if (element === '\n') {
        this.line += 1;
        this.lastLineBreak = this.index;
    }
    return {value: element};
}

/**
 * @returns {Object} the current `{line, col}`, suitable for reset().
 */
StreamLexer.prototype.save = function() {
    var column = this.index - this.lastLineBreak;
    return {
        line: this.line,
        col: column,
    };
}

/**
 * Append location information to an error message. For string buffers this
 * includes up to five source lines with line numbers and a caret.
 *
 * @param {Object} token - the offending token (not inspected here).
 * @param {string} message - message to extend.
 * @returns {string} the message with position details.
 */
StreamLexer.prototype.formatError = function(token, message) {
    // Left-pad `n` with spaces to `length` characters.
    function pad(n, length) {
        var s = String(n);
        return Array(length - s.length + 1).join(" ") + s;
    }

    // nb. this gets called after consuming the offending token,
    // so the culprit is index-1
    var buffer = this.buffer;
    if (typeof buffer !== 'string') {
        return message + " at index " + (this.index - 1);
    }

    var currentLine = this.line;
    var context = buffer.split("\n").slice(Math.max(0, currentLine - 5), currentLine);
    var col = this.index - this.lastLineBreak;
    var gutterWidth = String(currentLine).length;

    var numbered = [];
    for (var i = 0; i < context.length; i++) {
        var lineNumber = currentLine - context.length + i + 1;
        numbered.push(pad(lineNumber, gutterWidth) + " " + context[i]);
    }

    var result = message + (" at line " + currentLine + " col " + col + ":\n\n");
    result += numbered.join("\n");
    result += "\n" + pad("", gutterWidth + col) + "^\n";
    return result;
}
|
|
|
|
/**
 * Streaming parser driven by a Grammar.
 *
 * Accepts either (grammar, options) with a Grammar instance, or
 * (rules, start, options), in which case the grammar is built with
 * Grammar.fromCompiled.
 *
 * Options (all optional):
 *   keepHistory - when false (default) feed() discards the column before
 *                 the current one as it advances.
 *   lexer       - lexer to use; defaults to grammar.lexer or a StreamLexer.
 *
 * @param {Grammar|Object|Array} rules - grammar, compiled grammar, or rules.
 * @param {string|Object} [start] - start symbol, or the options object when
 *     `rules` is a Grammar.
 * @param {Object} [options] - options when `rules` is not a Grammar.
 */
function Parser(rules, start, options) {
    var grammar;
    if (rules instanceof Grammar) {
        grammar = rules;
        options = start;
    } else {
        grammar = Grammar.fromCompiled(rules, start);
    }
    this.grammar = grammar;

    // Read options
    this.options = {
        keepHistory: false,
        lexer: grammar.lexer || new StreamLexer,
    };
    for (var key in (options || {})) {
        this.options[key] = options[key];
    }

    // Setup lexer
    this.lexer = this.options.lexer;
    this.lexerState = undefined;

    // Setup a table
    var column = new Column(grammar, 0);
    this.table = [column];

    // I could be expecting anything.
    column.wants[grammar.start] = [];
    column.predict(grammar.start);
    // TODO what if start rule is nullable?
    column.process();
    this.current = 0; // token index
}

// create a reserved token for indicating a parse fail
Parser.fail = {};

/**
 * Lex `chunk` and advance the parse table by one column per token.
 *
 * @param {*} chunk - input handed to the lexer's reset().
 * @returns {Parser} this, to allow chaining.
 * @throws {Error} with `offset` and `token` set, when the lexer throws or
 *     when no state survives a token.
 */
Parser.prototype.feed = function(chunk) {
    var lexer = this.lexer;
    lexer.reset(chunk, this.lexerState);

    var token;
    var column;
    while (true) {
        try {
            token = lexer.next();
            if (!token) {
                break;
            }
        } catch (e) {
            // Create the next column so that the error reporter
            // can display the correctly predicted states.
            this.table.push(new Column(this.grammar, this.current + 1));
            var lexerErr = new Error(this.reportLexerError(e));
            lexerErr.offset = this.current;
            lexerErr.token = e.token;
            throw lexerErr;
        }
        // We add new states to table[current+1]
        column = this.table[this.current];

        // GC unused states
        if (!this.options.keepHistory) {
            delete this.table[this.current - 1];
        }

        var n = this.current + 1;
        var nextColumn = new Column(this.grammar, n);
        this.table.push(nextColumn);

        // Advance all tokens that expect the symbol
        var literal = token.text !== undefined ? token.text : token.value;
        // The built-in StreamLexer yields bare values; any other lexer's
        // token object is passed through whole.
        var value = lexer.constructor === StreamLexer ? token.value : token;
        var scannable = column.scannable;
        for (var w = scannable.length; w--; ) {
            var state = scannable[w];
            var expect = state.rule.symbols[state.dot];
            // Try to consume the token
            // either regex or literal
            if (expect.test ? expect.test(value) :
                expect.type ? expect.type === token.type
                            : expect.literal === literal) {
                // Add it
                var next = state.nextState({data: value, token: token, isToken: true, reference: n - 1});
                nextColumn.states.push(next);
            }
        }

        // Next, for each of the rules, we either
        // (a) complete it, and try to see if the reference row expected that
        //     rule
        // (b) predict the next nonterminal it expects by adding that
        //     nonterminal's start state
        // To prevent duplication, we also keep track of rules we have already
        // added
        nextColumn.process();

        // If needed, throw an error:
        if (nextColumn.states.length === 0) {
            // No states at all! This is not good.
            var err = new Error(this.reportError(token));
            err.offset = this.current;
            err.token = token;
            throw err;
        }

        // maybe save lexer state
        if (this.options.keepHistory) {
            column.lexerState = lexer.save();
        }

        this.current++;
    }
    // Only record lexer state if at least one token was consumed.
    if (column) {
        this.lexerState = lexer.save();
    }

    // Incrementally keep track of results
    this.results = this.finish();

    // Allow chaining, for whatever it's worth
    return this;
};

/**
 * Build the error message for an exception thrown by the lexer.
 *
 * @param {Error} lexerError - the thrown error; its optional `token` is
 *     used for display and location.
 * @returns {string} the full error message.
 */
Parser.prototype.reportLexerError = function(lexerError) {
    var tokenDisplay, lexerMessage;
    // Planning to add a token property to moo's thrown error
    // even on erroring tokens to be used in error display below
    var token = lexerError.token;
    if (token) {
        tokenDisplay = "input " + JSON.stringify(token.text[0]) + " (lexer error)";
        lexerMessage = this.lexer.formatError(token, "Syntax error");
    } else {
        tokenDisplay = "input (lexer error)";
        lexerMessage = lexerError.message;
    }
    return this.reportErrorCommon(lexerMessage, tokenDisplay);
};

/**
 * Build the error message for a token that no state could consume.
 *
 * @param {Object} token - the unexpected token.
 * @returns {string} the full error message.
 */
Parser.prototype.reportError = function(token) {
    var tokenDisplay = (token.type ? token.type + " token: " : "") + JSON.stringify(token.value !== undefined ? token.value : token);
    var lexerMessage = this.lexer.formatError(token, "Syntax error");
    return this.reportErrorCommon(lexerMessage, tokenDisplay);
};

/**
 * Shared tail of the error reporters: list what the parser was expecting,
 * based on the column before the last one in the table.
 *
 * @param {string} lexerMessage - location message from the lexer.
 * @param {string} tokenDisplay - printable form of the offending input.
 * @returns {string} the full, newline-joined error message.
 */
Parser.prototype.reportErrorCommon = function(lexerMessage, tokenDisplay) {
    var lines = [];
    lines.push(lexerMessage);
    // The last column belongs to the failed token; the one before it holds
    // the states that were waiting for input.
    var lastColumnIndex = this.table.length - 2;
    var lastColumn = this.table[lastColumnIndex];
    var expectantStates = lastColumn.states
        .filter(function(state) {
            var nextSymbol = state.rule.symbols[state.dot];
            return nextSymbol && typeof nextSymbol !== "string";
        });

    if (expectantStates.length === 0) {
        lines.push('Unexpected ' + tokenDisplay + '. I did not expect any more input. Here is the state of my parse table:\n');
        this.displayStateStack(lastColumn.states, lines);
    } else {
        lines.push('Unexpected ' + tokenDisplay + '. Instead, I was expecting to see one of the following:\n');
        // Display a "state stack" for each expectant state
        // - which shows you how this state came to be, step by step.
        // If there is more than one derivation, we only display the first one.
        var stateStacks = expectantStates
            .map(function(state) {
                return this.buildFirstStateStack(state, []) || [state];
            }, this);
        // Display each state that is expecting a terminal symbol next.
        stateStacks.forEach(function(stateStack) {
            var state = stateStack[0];
            var nextSymbol = state.rule.symbols[state.dot];
            var symbolDisplay = this.getSymbolDisplay(nextSymbol);
            lines.push('A ' + symbolDisplay + ' based on:');
            this.displayStateStack(stateStack, lines);
        }, this);
    }
    lines.push("");
    return lines.join("\n");
}

/**
 * Append one line per state to `lines`, collapsing runs of identical
 * displays into a single line followed by a "^ N more lines" note.
 *
 * @param {Array} stateStack - states to display.
 * @param {Array} lines - output array, appended to in place.
 */
Parser.prototype.displayStateStack = function(stateStack, lines) {
    var lastDisplay;
    var sameDisplayCount = 0;

    function flushRepeats() {
        if (sameDisplayCount > 0) {
            lines.push(' ^ ' + sameDisplayCount + ' more lines identical to this');
        }
        sameDisplayCount = 0;
    }

    for (var j = 0; j < stateStack.length; j++) {
        var state = stateStack[j];
        var display = state.rule.toString(state.dot);
        if (display === lastDisplay) {
            sameDisplayCount++;
        } else {
            flushRepeats();
            lines.push(' ' + display);
        }
        lastDisplay = display;
    }
    // A run of duplicates at the very end of the stack must be reported too.
    flushRepeats();
};

/**
 * @param {*} symbol - a grammar symbol.
 * @returns {string} its long, human-readable description.
 */
Parser.prototype.getSymbolDisplay = function(symbol) {
    return getSymbolLongDisplay(symbol);
};

/*
Builds the first state stack. You can think of a state stack as the call stack
of the recursive-descent parser which the Nearley parse algorithm simulates.
A state stack is represented as an array of state objects. Within a
state stack, the first item of the array will be the starting
state, with each successive item in the array going further back into history.

This function needs to be given a starting state and an empty array representing
the visited states, and it returns a single state stack, or null when
following the first `wantedBy` entries leads back to an already-visited state.
*/
Parser.prototype.buildFirstStateStack = function(state, visited) {
    if (visited.indexOf(state) !== -1) {
        // Found cycle, return null
        // to eliminate this path from the results, because
        // we don't know how to display it meaningfully
        return null;
    }
    if (state.wantedBy.length === 0) {
        return [state];
    }
    var prevState = state.wantedBy[0];
    var childVisited = [state].concat(visited);
    var childResult = this.buildFirstStateStack(prevState, childVisited);
    if (childResult === null) {
        return null;
    }
    return [state].concat(childResult);
};

/**
 * @returns {Column} the current column, stamped with the current lexer
 *     state, for later use with restore().
 */
Parser.prototype.save = function() {
    var column = this.table[this.current];
    column.lexerState = this.lexerState;
    return column;
};

/**
 * Reset the parser to a column obtained from save(); every later column is
 * dropped from the table and `results` is recomputed.
 *
 * @param {Column} column - column to resume from.
 */
Parser.prototype.restore = function(column) {
    var index = column.index;
    this.current = index;
    this.table[index] = column;
    this.table.splice(index + 1);
    this.lexerState = column.lexerState;

    // Incrementally keep track of results
    this.results = this.finish();
};

// nb. deprecated: use save/restore instead!
/**
 * Restore the parser to the column at `index`.
 *
 * @param {number} index - column index to return to.
 * @throws {Error} when the `keepHistory` option is not set.
 */
Parser.prototype.rewind = function(index) {
    if (!this.options.keepHistory) {
        throw new Error('set option `keepHistory` to enable rewinding')
    }
    // nb. recall column (table) indices fall between token indices.
    //        col 0   --   token 0   --   col 1
    this.restore(this.table[index]);
};

/**
 * Collect the possible parsings from the last column: the data of every
 * complete state for the start rule that began at column 0 and did not
 * fail.
 *
 * @returns {Array} data of each successful parse.
 */
Parser.prototype.finish = function() {
    var considerations = [];
    var start = this.grammar.start;
    var column = this.table[this.table.length - 1];
    column.states.forEach(function (t) {
        if (t.rule.name === start
                && t.dot === t.rule.symbols.length
                && t.reference === 0
                && t.data !== Parser.fail) {
            considerations.push(t);
        }
    });
    return considerations.map(function(c) { return c.data; });
};
|
|
|
|
/**
 * Describe a grammar symbol in words, for error messages.
 *
 * @param {*} symbol - nonterminal name, literal object, RegExp, token-type
 *     object, or object with a `test` member.
 * @returns {string|undefined} the description; undefined when `symbol` is
 *     neither a string nor an object.
 * @throws {Error} for an object that matches none of the known shapes.
 */
function getSymbolLongDisplay(symbol) {
    if (typeof symbol === "string") {
        return symbol;
    }
    if (typeof symbol !== "object") {
        // Not a recognised kind of value: nothing to display.
        return undefined;
    }
    // The order of these checks matters: a RegExp also has a `test` member.
    if (symbol.literal) {
        return JSON.stringify(symbol.literal);
    }
    if (symbol instanceof RegExp) {
        return 'character matching ' + symbol;
    }
    if (symbol.type) {
        return symbol.type + ' token';
    }
    if (symbol.test) {
        return 'token matching ' + String(symbol.test);
    }
    throw new Error('Unknown symbol type: ' + symbol);
}
|
|
|
|
/**
 * Compact rendering of a grammar symbol, used when printing rules.
 *
 * @param {*} symbol - nonterminal name, literal object, RegExp, token-type
 *     object, or object with a `test` member.
 * @returns {string|undefined} the rendering; undefined when `symbol` is
 *     neither a string nor an object.
 * @throws {Error} for an object that matches none of the known shapes.
 */
function getSymbolShortDisplay(symbol) {
    if (typeof symbol === "string") {
        return symbol;
    }
    if (typeof symbol !== "object") {
        // Not a recognised kind of value: nothing to display.
        return undefined;
    }
    // The order of these checks matters: a RegExp also has a `test` member.
    if (symbol.literal) {
        return JSON.stringify(symbol.literal);
    }
    if (symbol instanceof RegExp) {
        return symbol.toString();
    }
    if (symbol.type) {
        return '%' + symbol.type;
    }
    if (symbol.test) {
        return '<' + String(symbol.test) + '>';
    }
    throw new Error('Unknown symbol type: ' + symbol);
}
|
|
|
|
// Value returned by the factory function: the object that the wrapper at the
// top of the file assigns to `module.exports` or `root.nearley`.
return {
|
|
Parser: Parser,
|
|
Grammar: Grammar,
|
|
Rule: Rule,
|
|
};
|
|
|
|
}));
|