Consider the following EBNF description of simple arithmetic expressions:
// <expr> → <term> | <term> + <term> | <term> - <term>
<expr> → <term> {( + | - ) <term>}
// <term> → <factor> | <factor> * <factor> | <factor> / <factor>
<term> → <factor> {( * | / ) <factor>}
<factor> → id | int_lit | ( <expr> )
Suppose a lexical analyzer recognizes the operators: +, -, *, /
Write lexical and syntactic analyzers that process arithmetic expressions.
STATE — a type that describes the states of the lexer finite automaton.
TOKEN — a type that lists the tokens of our language.
// Lexer finite-state automaton states
// Lexer FSA states: START (between tokens), IDENT (inside an identifier),
// NUMBR (inside an integer literal), EOF (input exhausted).
type STATE = START | IDENT | NUMBR | EOF
// Tokens
// Note: IDENT and EOF also appear in STATE; this later definition shadows
// those constructors, which is why the code below must qualify them as
// STATE.IDENT / TOKEN.IDENT and STATE.EOF / TOKEN.EOF.
type TOKEN = INT_LIT | IDENT | ASSIGN_OP | ADD_OP | SUB_OP | MULT_OP | DIV_OP | LEFT_PAREN | RIGHT_PAREN | UNKNOWN | EOF
// True when the character is an ASCII letter (a-z or A-Z), false otherwise.
let Alpha (c: char) =
    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
// True when the character is an ASCII decimal digit (0-9), false otherwise.
let Number (c: char) = '0' <= c && c <= '9'
// True when the character is a blank or a tab, false otherwise.
let Space (c: char) = c = ' ' || c = '\t'
Below are some active patterns for detecting binary operations and parentheses.
// Active pattern that classifies a character as a parenthesis token,
// or fails to match for any other character.
let (|Parent|_|) input =
    if input = '(' then Some LEFT_PAREN
    elif input = ')' then Some RIGHT_PAREN
    else None
// Active pattern that classifies a character as a binary-operation token
// (including `=` for assignment), or fails to match otherwise.
let (|BinOp|_|) input =
    match input with
    | '+' -> Some ADD_OP
    | '-' -> Some SUB_OP
    | '*' -> Some MULT_OP
    | '/' -> Some DIV_OP
    | '=' -> Some ASSIGN_OP
    | _ -> None
// Append a single character to the end of a string.
let append S C = S + string C
// Tokenizer: implementation of FSA
// A lexer context is (remaining input, FSA state, current lexeme, current token).
// Each call consumes input until one complete token is recognized and returns
// the context that holds it; the caller re-invokes with the returned context.
let rec tokenize ((source:seq<char>), state, lexeme, token) =
    if Seq.isEmpty source && state = STATE.EOF then
        (Seq.empty<char>, STATE.EOF, "", EOF) // Last token in tokenizer, repeat it infinitely
    else if Seq.isEmpty source then
        (Seq.empty<char>, STATE.EOF, lexeme, token) // Indicate end of input
    else
        let C = Seq.head source
        let S = Seq.tail source
        match (C, state) with
        // START: skip whitespace between tokens
        | (C , STATE.START) when Space C -> tokenize (S, state, "", token)
        // Single-character tokens (operators, parentheses) are complete at once
        | (BinOp Op, STATE.START) -> (S, state, append "" C, Op)
        | (Parent P, STATE.START) -> (S, state, append "" C, P)
        // First character of an identifier / integer literal
        | (C , STATE.START) when Alpha C -> tokenize (S, STATE.IDENT, append "" C, TOKEN.IDENT)
        | (C , STATE.START) when Number C -> tokenize (S, STATE.NUMBR, append "" C, TOKEN.INT_LIT)
        // IDENT: extend the identifier, or emit it on a delimiter.
        // NOTE(review): digits are not accepted inside identifiers (e.g. `a1`
        // falls through to the UNKNOWN case) — confirm this is intended.
        | (C , STATE.IDENT) when Alpha C -> tokenize (S, state, append lexeme C, TOKEN.IDENT)
        | (C , STATE.IDENT) when Space C -> (S, STATE.START, lexeme, TOKEN.IDENT)
        // Operator/paren ends the identifier; `source` (not S) is returned so
        // the delimiter character is re-read on the next call
        | (BinOp Op, STATE.IDENT) -> (source, STATE.START, lexeme, TOKEN.IDENT)
        | (Parent P, STATE.IDENT) -> (source, STATE.START, lexeme, TOKEN.IDENT)
        // NUMBR: extend the integer literal, or emit it on a delimiter
        | (C , STATE.NUMBR) when Space C -> (S, STATE.START, lexeme, TOKEN.INT_LIT)
        | (C , STATE.NUMBR) when Number C -> tokenize (S, state, append lexeme C, TOKEN.INT_LIT)
        | (BinOp Op, STATE.NUMBR) -> (source, STATE.START, lexeme, TOKEN.INT_LIT)
        | (Parent P, STATE.NUMBR) -> (source, STATE.START, lexeme, TOKEN.INT_LIT)
        // Any other character is a lexical error: emit UNKNOWN and stop lexing
        | _ -> (Seq.empty<char>, STATE.EOF, append lexeme C, TOKEN.UNKNOWN)
// Form a lazy sequence of lexer contexts by repeatedly calling the tokenizer,
// stopping once the EOF token is produced.
let token_sequence source =
    let initial = tokenize (source, STATE.START, "", TOKEN.UNKNOWN)
    let step ((_, _, _, tkn) as current) =
        match tkn with
        | TOKEN.EOF -> None
        | _ -> Some (current, tokenize current)
    Seq.unfold step initial
// Test some inputs: tokenize a valid expression and print every context.
let correct_input = "(prod + 37)/total"
for (_, st, lex, tkn) in token_sequence correct_input do
    printfn "State: %A,\tLexem: %s,\tToken: %A" st lex tkn
State: START, Lexem: (, Token: LEFT_PAREN State: START, Lexem: prod, Token: IDENT State: START, Lexem: +, Token: ADD_OP State: START, Lexem: 37, Token: INT_LIT State: START, Lexem: ), Token: RIGHT_PAREN State: START, Lexem: /, Token: DIV_OP State: EOF, Lexem: total, Token: IDENT
// This input contains error: `47?` is not a proper integer literal
let error_input = "(sum + 47?)/ total"
for (_, st, lex, tkn) in token_sequence error_input do
    printfn "State: %A,\tLexem: %s,\tToken: %A" st lex tkn
State: START, Lexem: (, Token: LEFT_PAREN State: START, Lexem: sum, Token: IDENT State: START, Lexem: +, Token: ADD_OP State: EOF, Lexem: 47?, Token: UNKNOWN
When a nonterminal has only one RHS, coding the parsing function is direct.
A nonterminal that has more than one RHS requires an initial step to determine which RHS it is to parse.
// Extract the token component from a lexer context tuple.
let token (_, _, _, tkn) = tkn
// Call the tokenizer, print the token it produced, and return the new context.
let tokenize_print ctx =
    let ((_, _, lex, tkn) as newctx) = tokenize ctx
    printfn "%A: %s" tkn lex
    newctx
Implementation of a recursive-descent parser based on the following grammar:
<expr> → <term> {( + | - ) <term>}
<term> → <factor> {( * | / ) <factor>}
<factor> → id | int_lit | ( <expr> )
It consists of the expr, term, and factor functions.
// Parses strings in the language generated by the rule:
// <factor> -> id | int_constant | ( <expr> )
let rec factor context =
    printfn "Enter <factor>"
    // Determine which RHS to parse from the current token
    let next_context =
        match token context with
        | TOKEN.IDENT | TOKEN.INT_LIT ->
            // id / int_lit: consume it and get the next token
            tokenize_print context
        | TOKEN.LEFT_PAREN ->
            // ( <expr> ): pass over the left parenthesis, call expr,
            // and check for the right parenthesis
            let expr_context = context |> tokenize_print |> expr
            if token expr_context = TOKEN.RIGHT_PAREN then
                tokenize_print expr_context
            else
                failwith "No right parenthesis"
        | _ ->
            failwith "Not an id, an integer literal, or a left parenthesis"
    printfn "Exit <factor>"
    next_context
// As long as the next token is * or /, get the next token and parse the next factor
and next_factor context =
    let tkn = token context
    if tkn = TOKEN.MULT_OP || tkn = TOKEN.DIV_OP then
        next_factor (factor (tokenize_print context))
    else
        context
// Parses strings in the language generated by the rule:
// <term> -> <factor> {(* | /) <factor>}
and term context =
    printfn "Enter <term>"
    let result = next_factor (factor context)
    printfn "Exit <term>"
    result
// As long as the next token is + or -, get the next token and parse the next term
and next_term context =
    let tkn = token context
    if tkn = TOKEN.ADD_OP || tkn = TOKEN.SUB_OP then
        next_term (term (tokenize_print context))
    else
        context
// Parses strings in the language generated by the rule:
// <expr> -> <term> {(+ | -) <term>}
and expr context =
    printfn "Enter <expr>"
    let result = next_term (term context)
    printfn "Exit <expr>"
    result
// Parser function: lex the first token from a fresh context, then parse a
// complete <expr>, discarding the final context.
let parser source =
    printfn "Start parsing: %s" source
    (source, STATE.START, "", TOKEN.UNKNOWN)
    |> tokenize_print
    |> expr
    |> ignore
// Demo: fetch and print the first token of `correct_input`
// (the FSI table below shows the resulting context tuple).
tokenize_print (correct_input, STATE.START, "", TOKEN.UNKNOWN)
LEFT_PAREN: (
Item1 | Item2 | Item3 | Item4 |
---|---|---|---|
[ p, r, o, d, , +, , 3, 7, ), /, t, o, t, a, l ] | START | ( | LEFT_PAREN |
correct_input
(prod + 37)/total
// Parse the valid expression; the full trace is shown below.
parser correct_input
Start parsing: (prod + 37)/total LEFT_PAREN: ( Enter <expr> Enter <term> Enter <factor> IDENT: prod Enter <expr> Enter <term> Enter <factor> ADD_OP: + Exit <factor> Exit <term> INT_LIT: 37 Enter <term> Enter <factor> RIGHT_PAREN: ) Exit <factor> Exit <term> Exit <expr> DIV_OP: / Exit <factor> IDENT: total Enter <factor> EOF: Exit <factor> Exit <term> Exit <expr>
// The lexical error (`47?` -> UNKNOWN) makes the parser fail inside <factor>.
try
    parser error_input
with
| Failure(msg) -> printfn "Error: %s" msg
Start parsing: (sum + 47?)/ total LEFT_PAREN: ( Enter <expr> Enter <term> Enter <factor> IDENT: sum Enter <expr> Enter <term> Enter <factor> ADD_OP: + Exit <factor> Exit <term> UNKNOWN: 47? Enter <term> Enter <factor> Error: Not an id, an integer literal, or a left parenthesis
// Syntax error: after `+` the parser expects a <factor>, but finds `-`.
try
    parser "a +- b"
with
| Failure(msg) -> printfn "Error: %s" msg
Start parsing: a +- b IDENT: a Enter <expr> Enter <term> Enter <factor> ADD_OP: + Exit <factor> Exit <term> SUB_OP: - Enter <term> Enter <factor> Error: Not an id, an integer literal, or a left parenthesis