Consider the following EBNF description of simple arithmetic expressions:
// <expr> → <term> | <term> + <term> | <term> - <term>
<expr> → <term> {( + | - ) <term>}
// <term> → <factor> | <factor> * <factor> | <factor> / <factor>
<term> → <factor> {( * | / ) <factor>}
<factor> → id | int_lit | ( <expr> )
Suppose a lexical analyzer recognizes the operators: +, -, *, /
Write lexical and syntactic analyzers that process arithmetic expressions.
STATE — a type that describes the states of the lexer finite automaton.
TOKEN — a type that lists the tokens of our language.
// Lexer finite-state automaton states
// Lexer FSA states: START (between tokens), IDENT (inside an identifier),
// NUMBR (inside an integer literal), EOF (input exhausted).
type STATE = START | IDENT | NUMBR | EOF
// Tokens
// Note: IDENT and EOF also appear in STATE; this later definition shadows
// those constructors, which is why the code below must qualify them as
// STATE.IDENT / TOKEN.IDENT and STATE.EOF / TOKEN.EOF.
type TOKEN = INT_LIT | IDENT | ASSIGN_OP | ADD_OP | SUB_OP | MULT_OP | DIV_OP | LEFT_PAREN | RIGHT_PAREN | UNKNOWN | EOF
// True when the character is an ASCII letter (a-z or A-Z), false otherwise.
let Alpha (c: char) =
    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
// True when the character is an ASCII decimal digit (0-9), false otherwise.
let Number (c: char) = '0' <= c && c <= '9'
// True when the character is a blank or a tab, false otherwise.
let Space (c: char) = c = ' ' || c = '\t'
Below are some active patterns for detecting binary operations and parentheses.
// Active pattern that classifies a character as a parenthesis token,
// or fails to match for any other character.
let (|Parent|_|) input =
    if input = '(' then Some LEFT_PAREN
    elif input = ')' then Some RIGHT_PAREN
    else None
// Active pattern that classifies a character as a binary-operation token
// (including `=` for assignment), or fails to match otherwise.
let (|BinOp|_|) input =
    match input with
    | '+' -> Some ADD_OP
    | '-' -> Some SUB_OP
    | '*' -> Some MULT_OP
    | '/' -> Some DIV_OP
    | '=' -> Some ASSIGN_OP
    | _ -> None
// Append a single character to the end of a string.
let append S C = S + string C
// Tokenizer: implementation of FSA
// A lexer context is (remaining input, FSA state, current lexeme, current token).
// Each call consumes input until one complete token is recognized and returns
// the context that holds it; the caller re-invokes with the returned context.
let rec tokenize ((source:seq<char>), state, lexeme, token) =
    if Seq.isEmpty source && state = STATE.EOF then
        (Seq.empty<char>, STATE.EOF, "", EOF) // Last token in tokenizer, repeat it infinitely
    else if Seq.isEmpty source then
        (Seq.empty<char>, STATE.EOF, lexeme, token) // Indicate end of input
    else
        let C = Seq.head source
        let S = Seq.tail source
        match (C, state) with
        // START: skip whitespace between tokens
        | (C , STATE.START) when Space C -> tokenize (S, state, "", token)
        // Single-character tokens (operators, parentheses) are complete at once
        | (BinOp Op, STATE.START) -> (S, state, append "" C, Op)
        | (Parent P, STATE.START) -> (S, state, append "" C, P)
        // First character of an identifier / integer literal
        | (C , STATE.START) when Alpha C -> tokenize (S, STATE.IDENT, append "" C, TOKEN.IDENT)
        | (C , STATE.START) when Number C -> tokenize (S, STATE.NUMBR, append "" C, TOKEN.INT_LIT)
        // IDENT: extend the identifier, or emit it on a delimiter.
        // NOTE(review): digits are not accepted inside identifiers (e.g. `a1`
        // falls through to the UNKNOWN case) — confirm this is intended.
        | (C , STATE.IDENT) when Alpha C -> tokenize (S, state, append lexeme C, TOKEN.IDENT)
        | (C , STATE.IDENT) when Space C -> (S, STATE.START, lexeme, TOKEN.IDENT)
        // Operator/paren ends the identifier; `source` (not S) is returned so
        // the delimiter character is re-read on the next call
        | (BinOp Op, STATE.IDENT) -> (source, STATE.START, lexeme, TOKEN.IDENT)
        | (Parent P, STATE.IDENT) -> (source, STATE.START, lexeme, TOKEN.IDENT)
        // NUMBR: extend the integer literal, or emit it on a delimiter
        | (C , STATE.NUMBR) when Space C -> (S, STATE.START, lexeme, TOKEN.INT_LIT)
        | (C , STATE.NUMBR) when Number C -> tokenize (S, state, append lexeme C, TOKEN.INT_LIT)
        | (BinOp Op, STATE.NUMBR) -> (source, STATE.START, lexeme, TOKEN.INT_LIT)
        | (Parent P, STATE.NUMBR) -> (source, STATE.START, lexeme, TOKEN.INT_LIT)
        // Any other character is a lexical error: emit UNKNOWN and stop lexing
        | _ -> (Seq.empty<char>, STATE.EOF, append lexeme C, TOKEN.UNKNOWN)
// Form a lazy sequence of lexer contexts by repeatedly calling the tokenizer,
// stopping once the EOF token is produced.
let token_sequence source =
    let initial = tokenize (source, STATE.START, "", TOKEN.UNKNOWN)
    let step ((_, _, _, tkn) as current) =
        match tkn with
        | TOKEN.EOF -> None
        | _ -> Some (current, tokenize current)
    Seq.unfold step initial
// Test some inputs: tokenize a valid expression and print every context.
let correct_input = "(prod + 37)/total"
for (_, st, lex, tkn) in token_sequence correct_input do
    printfn "State: %A,\tLexem: %s,\tToken: %A" st lex tkn
State: START, Lexem: (, Token: LEFT_PAREN State: START, Lexem: prod, Token: IDENT State: START, Lexem: +, Token: ADD_OP State: START, Lexem: 37, Token: INT_LIT State: START, Lexem: ), Token: RIGHT_PAREN State: START, Lexem: /, Token: DIV_OP State: EOF, Lexem: total, Token: IDENT
// This input contains error: `47?` is not a proper integer literal
let error_input = "(sum + 47?)/ total"
for (_, st, lex, tkn) in token_sequence error_input do
    printfn "State: %A,\tLexem: %s,\tToken: %A" st lex tkn
State: START, Lexem: (, Token: LEFT_PAREN State: START, Lexem: sum, Token: IDENT State: START, Lexem: +, Token: ADD_OP State: EOF, Lexem: 47?, Token: UNKNOWN
When a nonterminal has only one RHS, coding the parsing function is direct.
A nonterminal that has more than one RHS requires an initial step to determine which RHS it is to parse.
// Extract the token component from a lexer context tuple.
let token (_, _, _, tkn) = tkn
// Call the tokenizer, print the token it produced, and return the new context.
let tokenize_print ctx =
    let ((_, _, lex, tkn) as newctx) = tokenize ctx
    printfn "%A: %s" tkn lex
    newctx
Implementation of a recursive-descent parser based on the following grammar:
<expr> → <term> {( + | - ) <term>}
<term> → <factor> {( * | / ) <factor>}
<factor> → id | int_lit | ( <expr> )
It consists of the expr, term, and factor functions.
// Parses strings in the language generated by the rule:
// <factor> -> id | int_constant | ( <expr> )
let rec factor context =
    printfn "Enter <factor>"
    // Determine which RHS to parse from the current token
    let next_context =
        match token context with
        | TOKEN.IDENT | TOKEN.INT_LIT ->
            // id / int_lit: consume it and get the next token
            tokenize_print context
        | TOKEN.LEFT_PAREN ->
            // ( <expr> ): pass over the left parenthesis, call expr,
            // and check for the right parenthesis
            let expr_context = context |> tokenize_print |> expr
            if token expr_context = TOKEN.RIGHT_PAREN then
                tokenize_print expr_context
            else
                failwith "No right parenthesis"
        | _ ->
            failwith "Not an id, an integer literal, or a left parenthesis"
    printfn "Exit <factor>"
    next_context
// As long as the next token is * or /, get the next token and parse the next factor
and next_factor context =
    let tkn = token context
    if tkn = TOKEN.MULT_OP || tkn = TOKEN.DIV_OP then
        next_factor (factor (tokenize_print context))
    else
        context
// Parses strings in the language generated by the rule:
// <term> -> <factor> {(* | /) <factor>}
and term context =
    printfn "Enter <term>"
    let result = next_factor (factor context)
    printfn "Exit <term>"
    result
// As long as the next token is + or -, get the next token and parse the next term
and next_term context =
    let tkn = token context
    if tkn = TOKEN.ADD_OP || tkn = TOKEN.SUB_OP then
        next_term (term (tokenize_print context))
    else
        context
// Parses strings in the language generated by the rule:
// <expr> -> <term> {(+ | -) <term>}
and expr context =
    printfn "Enter <expr>"
    let result = next_term (term context)
    printfn "Exit <expr>"
    result
// Parser function: lex the first token from a fresh context, then parse a
// complete <expr>, discarding the final context.
let parser source =
    printfn "Start parsing: %s" source
    (source, STATE.START, "", TOKEN.UNKNOWN)
    |> tokenize_print
    |> expr
    |> ignore
// Demo: fetch and print the first token of `correct_input`
// (the FSI table below shows the resulting context tuple).
tokenize_print (correct_input, STATE.START, "", TOKEN.UNKNOWN)
LEFT_PAREN: (
Item1 | Item2 | Item3 | Item4 |
---|---|---|---|
[ p, r, o, d, , +, , 3, 7, ), /, t, o, t, a, l ] | START | ( | LEFT_PAREN |
correct_input
(prod + 37)/total
// Parse the valid expression; the full trace is shown below.
parser correct_input
Start parsing: (prod + 37)/total LEFT_PAREN: ( Enter <expr> Enter <term> Enter <factor> IDENT: prod Enter <expr> Enter <term> Enter <factor> ADD_OP: + Exit <factor> Exit <term> INT_LIT: 37 Enter <term> Enter <factor> RIGHT_PAREN: ) Exit <factor> Exit <term> Exit <expr> DIV_OP: / Exit <factor> IDENT: total Enter <factor> EOF: Exit <factor> Exit <term> Exit <expr>
// The lexical error (`47?` -> UNKNOWN) makes the parser fail inside <factor>.
try
    parser error_input
with
| Failure(msg) -> printfn "Error: %s" msg
Start parsing: (sum + 47?)/ total LEFT_PAREN: ( Enter <expr> Enter <term> Enter <factor> IDENT: sum Enter <expr> Enter <term> Enter <factor> ADD_OP: + Exit <factor> Exit <term> UNKNOWN: 47? Enter <term> Enter <factor> Error: Not an id, an integer literal, or a left parenthesis
// Syntax error: after `+` the parser expects a <factor>, but finds `-`.
try
    parser "a +- b"
with
| Failure(msg) -> printfn "Error: %s" msg
Start parsing: a +- b IDENT: a Enter <expr> Enter <term> Enter <factor> ADD_OP: + Exit <factor> Exit <term> SUB_OP: - Enter <term> Enter <factor> Error: Not an id, an integer literal, or a left parenthesis