In our previous article, we explored the Abstract Syntax Tree (AST) structure and the BNF grammar that defines our SQL language. Now we’ll build the parser that transforms token streams into these AST structures using recursive descent parsing.
The parser is the bridge between the flat token sequence from our tokenizer and the hierarchical AST representation that enables semantic analysis and execution.
Parser Architecture
Our parser follows the recursive descent pattern, where each grammar rule maps to a parsing method:
private List<ColumnDefinition> ParseColumnDefinition() { var list = new List<ColumnDefinition>(); while (true) { // column name if (currentToken?.Type != TokenType.ID) { thrownew Exception("Column definition must start with column name"); } string columnName = currentToken.Lexeme; NextToken();
// column data type if (currentToken?.Type isnot (TokenType.INT or TokenType.FLOAT or TokenType.VARCHAR or TokenType.BOOL)) { thrownew Exception("Invalid data type for column"); } ColumnType type = currentToken?.Type switch { TokenType.INT => ColumnType.Int, TokenType.FLOAT => ColumnType.Float, TokenType.VARCHAR => ColumnType.String, TokenType.BOOL => ColumnType.Bool, _ => thrownew Exception($"Unsupported column type: {currentToken?.Type}") }; NextToken(); // Handle VARCHAR length int? len = null; if (type == ColumnType.String && currentToken?.Type == TokenType.L_BRACKET) { NextToken(); if (currentToken?.Type != TokenType.INT) thrownew Exception("VARCHAR type with length must be an integer"); len = int.TryParse(currentToken.Lexeme, outint v) ? v : 0; NextToken(); if (currentToken?.Type != TokenType.R_BRACKET) thrownew Exception("Missing ')' after VARCHAR length"); NextToken(); }
// Parse constraints var listCC = new List<ColumnConstraint>(); while (currentToken?.Type is TokenType.PRIMARY or TokenType.NOT or TokenType.UNIQUE or TokenType.DEFAULT) { var constraint = new ColumnConstraint(); switch (currentToken.Type) { case TokenType.PRIMARY: NextToken(); if (currentToken?.Type != TokenType.KEY) thrownew Exception("PRIMARY must be followed by KEY"); constraint.Type = ColumnConstraintType.PrimaryKey; NextToken(); break; case TokenType.NOT: NextToken(); if (currentToken?.Type != TokenType.NULL) thrownew Exception("NOT must be followed by NULL"); constraint.Type = ColumnConstraintType.NotNull; NextToken(); break; case TokenType.UNIQUE: constraint.Type = ColumnConstraintType.Unique; NextToken(); break; case TokenType.DEFAULT: constraint.Type = ColumnConstraintType.Default; NextToken(); constraint.Value = currentToken?.Lexeme; NextToken(); break; } listCC.Add(constraint); } list.Add(new ColumnDefinition(columnName, type, len, listCC)); if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } return list; }
// Optional column list if (currentToken?.Type == TokenType.L_BRACKET) { NextToken(); while (true) { if (currentToken?.Type != TokenType.ID) { thrownew Exception("Missing column name in INSERT statement"); } node.ColumnNames.Add(currentToken.Lexeme); NextToken(); if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } if (currentToken?.Type != TokenType.R_BRACKET) { thrownew Exception("Missing ')' in INSERT statement columns"); } NextToken(); }
if (currentToken?.Type != TokenType.VALUES) { thrownew Exception("Missing 'VALUES' in INSERT statement"); } NextToken(); // Parse values while (true) { if (currentToken?.Type != TokenType.L_BRACKET) { thrownew Exception("Missing '(' in INSERT statement values"); } NextToken(); var valuesList = new List<SqlValue>(); while (true) { if (currentToken?.Type isnot (TokenType.INT or TokenType.FLOAT or TokenType.NULL or TokenType.TRUE or TokenType.FALSE or TokenType.STRING_LITERAL)) { thrownew Exception("Invalid value type"); } valueList.Add(currentToken?.Type switch { TokenType.INT => new SqlValue(ValueType.Int, int.TryParse(currentToken.Lexeme, outint v) ? v : 0), TokenType.FLOAT => new SqlValue(ValueType.Float, double.TryParse(currentToken.Lexeme, outdouble d) ? d : 0.0), TokenType.TRUE => new SqlValue(ValueType.True, true), TokenType.FALSE => new SqlValue(ValueType.False, false), TokenType.STRING_LITERAL => new SqlValue(ValueType.String, currentToken.Lexeme), TokenType.NULL => new SqlValue(ValueType.Null, null), _ => thrownew Exception("Unknown value type") }); NextToken();
if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } node.Values.Add(valuesList); if (currentToken?.Type != TokenType.R_BRACKET) { thrownew Exception("Missing ')' in INSERT statement values"); } NextToken(); if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; }
private UpdateNode ParseUpdateStatement() { UpdateNode node = new UpdateNode(); if (currentToken?.Type != TokenType.UPDATE) { thrownew Exception("UPDATE statement must start with UPDATE"); } NextToken();
if (currentToken?.Type != TokenType.ID) { thrownew Exception("UPDATE statement must have a table name"); } node.TableName = currentToken.Lexeme; NextToken();
if (currentToken?.Type != TokenType.SET) { thrownew Exception("UPDATE statement must be followed by 'SET'"); } NextToken();
while (true) { if (currentToken?.Type != TokenType.ID) { thrownew Exception("UPDATE statement assignment must start with column name"); } string columnName = currentToken.Lexeme; NextToken();
if (currentToken?.Type != TokenType.EQUAL) { thrownew Exception("UPDATE statement assignment must use '=' operator"); } NextToken();
private SelectNode ParseSelectStatement() { var node = new SelectNode();
if (currentToken?.Type != TokenType.SELECT) { thrownew Exception("SELECT statement must start with SELECT"); } NextToken();
node.SelectList = ParseSelectList();
if (currentToken?.Type != TokenType.FROM) { thrownew Exception("Missing FROM clause in SELECT statement"); } NextToken(); node.TableNamesWithAlias = ParseTableList();
if (currentToken?.Type == TokenType.WHERE) { NextToken(); node.WhereClause = ParseExpression(); }
if (currentToken?.Type == TokenType.GROUP) { NextToken(); if (currentToken?.Type != TokenType.BY) { thrownew Exception("GROUP must be followed by BY"); } NextToken(); node.GroupByColumns = ParseGroupByColumns(); }
if (currentToken?.Type == TokenType.ORDER) { NextToken(); if (currentToken?.Type != TokenType.BY) { thrownew Exception("ORDER must be followed by BY"); } NextToken(); node.OrderItems = ParseOrderBy(); } return node; }
Grammar mapping:
1 2 3 4 5 6 7 8 9 10 11
<SelectStatement> ::= SELECT <SelectList> FROM <TableList> [WHERE <Expression>] [GROUP BY <ColumnList>] [ORDER BY <OrderList>] [SEMICOLON]
private List<SelectItem> ParseSelectList() { var list = new List<SelectItem>(); while (true) { if (currentToken?.Type == TokenType.ASTERISK) { list.Add(new SelectItem(true, new StarExpression(null), null)); NextToken(); } else { // Parse complete expression var expr = ParseExpression(); string? aliasName = null; // Check for AS alias if (currentToken?.Type == TokenType.AS) { NextToken(); if (currentToken?.Type != TokenType.ID) { thrownew Exception("AS must be followed by an identifier"); } aliasName = currentToken.Lexeme; NextToken(); } // Check if expression is wildcard * bool isWildcard = expr is StarExpression; list.Add(new SelectItem(isWildcard, expr, aliasName)); } if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } return list; }
Parsing flow:
Loop through select items:
If current token is *, create SelectItem with isStar=true
Otherwise, call ParseExpression() to parse expression
private List<(string, string)> ParseTableList() { var list = new List<(string, string)>(); while (true) { if (currentToken?.Type != TokenType.ID) { thrownew Exception("FROM must be followed by a table name identifier"); } string tableName = currentToken.Lexeme, aliasName = ""; NextToken(); if (currentToken?.Type == TokenType.ID) { aliasName = currentToken.Lexeme; NextToken(); } list.Add((tableName, aliasName)); if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } return list; }
Parsing flow:
Loop through tables:
Expect ID token, extract table name, consume token
private Expression ParseOr() { var left = ParseAnd(); while (currentToken?.Type == TokenType.OR) { NextToken(); var right = ParseAnd(); left = new BinaryExpression(left, BinaryOperatorType.Or, right); } return left; }
private Expression ParseAnd() { var left = ParseComparison(); while (currentToken?.Type == TokenType.AND) { NextToken(); var right = ParseComparison(); left = new BinaryExpression(left, BinaryOperatorType.And, right); } return left; }
private Expression ParseComparison() { var left = ParseAddSub(); // Handle BETWEEN operator if (currentToken?.Type == TokenType.BETWEEN) { NextToken(); var lowerBound = ParseAddSub(); if (currentToken?.Type != TokenType.AND) { thrownew Exception("BETWEEN must be followed by AND"); } NextToken(); var upperBound = ParseAddSub(); returnnew BetweenExpression(left, lowerBound, upperBound); } // Handle IN operator if (currentToken?.Type == TokenType.IN) { NextToken(); if (currentToken?.Type != TokenType.L_BRACKET) { thrownew Exception("IN must be followed by '('"); } NextToken(); var values = new List<Expression>(); // Check if it's a subquery if (currentToken?.Type == TokenType.SELECT) { var subquery = ParseSelectStatement(); values.Add(new SubqueryExpression(subquery)); } else { // Parse value list while (true) { values.Add(ParseExpression()); if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } } if (currentToken?.Type != TokenType.R_BRACKET) { thrownew Exception("Missing ')' in IN expression"); } NextToken(); returnnew InExpression(left, values); } // Handle regular comparison operators while (currentToken?.Type == TokenType.EQUAL || currentToken?.Type == TokenType.GREATER_THAN || currentToken?.Type == TokenType.LESS_THAN || currentToken?.Type == TokenType.GREATER_EQUAL_TO || currentToken?.Type == TokenType.LESS_EQUAL_TO) { var op = currentToken?.Type switch { TokenType.EQUAL => BinaryOperatorType.Equal, TokenType.LESS_EQUAL_TO => BinaryOperatorType.LessOrEqual, TokenType.LESS_THAN => BinaryOperatorType.LessThan, TokenType.GREATER_EQUAL_TO => BinaryOperatorType.GreaterOrEqual, TokenType.GREATER_THAN => BinaryOperatorType.GreaterThan, _ => thrownew Exception("Unknown comparison operator") }; NextToken(); var right = ParseAddSub(); left = new BinaryExpression(left, op, right); } return left; }
private Expression ParseAddSub() { var left = ParseMulDiv(); while (currentToken?.Type == TokenType.PLUS || currentToken?.Type == TokenType.MINUS) { var op = currentToken?.Type == TokenType.PLUS ? BinaryOperatorType.Add : BinaryOperatorType.Subtract; NextToken(); var right = ParseMulDiv(); left = new BinaryExpression(left, op, right); } return left; }
private Expression ParseMulDiv() { var left = ParsePrimary(); while (currentToken?.Type == TokenType.ASTERISK || currentToken?.Type == TokenType.DIVISION) { var op = currentToken?.Type == TokenType.ASTERISK ? BinaryOperatorType.Multiply : BinaryOperatorType.Divide; NextToken(); var right = ParsePrimary(); left = new BinaryExpression(left, op, right); } return left; }
private Expression ParsePrimary() { // Parenthesized expression or subquery if (currentToken?.Type == TokenType.L_BRACKET) { NextToken(); if (currentToken?.Type == TokenType.SELECT) { var subquery = ParseSelectStatement(); if (currentToken?.Type != TokenType.R_BRACKET) { thrownew Exception("Missing ')' for subquery"); } NextToken(); returnnew SubqueryExpression(subquery); } var expr = ParseExpression(); if (currentToken?.Type != TokenType.R_BRACKET) { thrownew Exception("Missing ')'"); } NextToken(); return expr; }
// Unary expressions if (currentToken?.Type == TokenType.NOT) { NextToken(); var expr = ParsePrimary(); returnnew UnaryExpression(UnaryOperatorType.Not, expr); } if (currentToken?.Type == TokenType.MINUS) { NextToken(); var expr = ParsePrimary(); returnnew UnaryExpression(UnaryOperatorType.Minus, expr); } if (currentToken?.Type == TokenType.PLUS) { NextToken(); var expr = ParsePrimary(); returnnew UnaryExpression(UnaryOperatorType.Plus, expr); }
// Function calls and column references if (currentToken?.Type == TokenType.ID || currentToken?.Type == TokenType.COUNT || currentToken?.Type == TokenType.SUM || currentToken?.Type == TokenType.AVG || currentToken?.Type == TokenType.MIN || currentToken?.Type == TokenType.MAX) { string name = currentToken.Lexeme; TokenType currentTokenType = currentToken.Type;
NextToken(); var arguments = new List<Expression>(); // Function call if (currentToken?.Type == TokenType.L_BRACKET) { NextToken(); // consume L_BRACKET while (true) { // Handle * as special case for functions like COUNT(*) if (currentToken?.Type == TokenType.ASTERISK) { arguments.Add(new StarExpression(null)); NextToken(); } else { arguments.Add(ParseExpression()); } if (currentToken?.Type == TokenType.COMMA) { NextToken(); continue; } break; } if (currentToken?.Type != TokenType.R_BRACKET) { thrownew Exception("Missing ')' for function call"); } NextToken(); FunctionName funcName = name switch { "sum" => FunctionName.Sum, "avg" => FunctionName.Avg, "count" => FunctionName.Count, "min" => FunctionName.Min, "max" => FunctionName.Max, _ => thrownew Exception("Unsupported function") }; returnnew FunctionCallExpression(funcName, arguments); }
// Only ID tokens can have dot notation for table.column if (currentTokenType != TokenType.ID) { // Function tokens without parentheses are not valid thrownew Exception($"Function '{name}' must be followed by parentheses"); } // Column reference with table qualifier if (currentToken?.Type == TokenType.DOT) { NextToken(); if (currentToken?.Type != TokenType.ID) { thrownew Exception("Dot must be followed by column name"); } string columnName = currentToken.Lexeme; NextToken(); returnnew ColumnRefExpression(columnName, name); } returnnew ColumnRefExpression(name); } // Literals if (currentToken?.Type == TokenType.INT) { objectvalue = int.TryParse(currentToken.Lexeme, outvar v) ? v : currentToken.Lexeme; NextToken(); returnnew LiteralExpression(value); } if (currentToken?.Type == TokenType.FLOAT) { objectvalue = double.TryParse(currentToken.Lexeme, outvar v) ? v : currentToken.Lexeme; NextToken(); returnnew LiteralExpression(value); } if (currentToken?.Type == TokenType.STRING_LITERAL) { objectvalue = currentToken.Lexeme; NextToken(); returnnew LiteralExpression(value); } thrownew Exception($"Unable to parse expression, unknown token: {currentToken?.Lexeme}"); }