// Lexer from the "Rewrite lexer" patch (commit 6cfdf24) to src/parser.rs.
//
// The lexer borrows the input text and produces a flat list of tokens whose
// `Word` / `Num` payloads are slices of that text (no allocation per token).
// `parse` in src/parser.rs drives it through `Lexer::tokenize(&text)`.

/// A lexical token of the journal syntax.
///
/// `Word` and `Num` borrow from the lexed input (`'s`); variants carrying a
/// `char` keep the exact character that produced them.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Token<'s> {
    /// Maximal run of alphabetic characters.
    Word(&'s str),
    /// Maximal run of numeric characters.
    Num(&'s str),
    /// `,` or `.`
    DecimalSep(char),
    /// `\n`
    Newline,
    /// Any whitespace character other than `\t` and `\n`.
    Space,
    /// `\t`
    Indent,
    /// `(`, `[` or `{`
    OpenKet(char),
    /// `)`, `]` or `}`
    CloseKet(char),
    /// `"`
    Quote,
    /// `*`, `!`, `@`, `-`, `/` or `:`
    Marker(char),
    /// `;`, `#` or `%`
    CommentMarker,
}

/// Splits a borrowed string into [`Token`]s.
struct Lexer<'s> {
    /// The complete input being lexed.
    text: &'s str,
    /// Tokens produced so far, in input order.
    tokens: Vec<Token<'s>>,
}

impl<'s> Lexer<'s> {
    /// Creates a lexer over `input` with an empty token list.
    fn new(input: &'s str) -> Lexer<'s> {
        Lexer {
            text: input,
            tokens: Vec::new(),
        }
    }

    /// Maps a character that is a token by itself to that token.
    ///
    /// Returns `None` for characters that are not single-character tokens
    /// (letters, digits and anything the syntax does not know).
    fn symbol(ch: char) -> Option<Token<'s>> {
        let token = match ch {
            // Tab and newline are checked before the generic whitespace arm
            // because they are whitespace too but have dedicated tokens.
            '\t' => Token::Indent,
            '\n' => Token::Newline,
            c if c.is_whitespace() => Token::Space,

            '"' => Token::Quote,
            ',' | '.' => Token::DecimalSep(ch),

            '(' | '[' | '{' => Token::OpenKet(ch),
            ')' | ']' | '}' => Token::CloseKet(ch),

            ';' | '#' | '%' => Token::CommentMarker,
            '*' | '!' | '@' | '-' | '/' | ':' => Token::Marker(ch),

            _ => return None,
        };
        Some(token)
    }

    /// Lexes `self.text` and appends the resulting tokens to `self.tokens`.
    ///
    /// Consecutive alphabetic characters form one `Word`, consecutive numeric
    /// characters form one `Num`; a run ends when the character class changes,
    /// when any other character is met, or at the end of the input.
    /// Characters that belong to no token are reported on stdout and skipped.
    fn lex(&mut self) {
        // Character class of a multi-character run.
        #[derive(Clone, Copy, PartialEq, Eq)]
        enum Predicate {
            Alphabetic,
            Numeric,
        }

        // Class of `ch` if it can be part of a run.
        fn classify(ch: char) -> Option<Predicate> {
            if ch.is_alphabetic() {
                Some(Predicate::Alphabetic)
            } else if ch.is_numeric() {
                Some(Predicate::Numeric)
            } else {
                None
            }
        }

        // Wraps a finished run in the token matching its class.
        fn run_token<'a>(kind: Predicate, slice: &'a str) -> Token<'a> {
            match kind {
                Predicate::Alphabetic => Token::Word(slice),
                Predicate::Numeric => Token::Num(slice),
            }
        }

        // Copy the `&'s str` out of `self` so that slices borrow the input
        // text (lifetime `'s`) rather than `self`.
        let text: &'s str = self.text;
        // Pending run: byte offset of its first character and its class.
        // Offsets only ever come from `char_indices`, so every slice below
        // starts and ends on a character boundary.
        let mut run: Option<(usize, Predicate)> = None;

        for (i, ch) in text.char_indices() {
            let class = classify(ch);

            // Close the pending run as soon as the current character does
            // not continue it.
            if let Some((start, kind)) = run {
                if class != Some(kind) {
                    self.tokens.push(run_token(kind, &text[start..i]));
                    run = None;
                }
            }

            match class {
                Some(kind) => {
                    if run.is_none() {
                        run = Some((i, kind));
                    }
                }
                None => match Self::symbol(ch) {
                    Some(token) => self.tokens.push(token),
                    None => println!("Invalid syntax token: {}", ch),
                },
            }
        }

        // A run that reaches the end of the input has no terminator to
        // trigger the flush above.
        if let Some((start, kind)) = run {
            self.tokens.push(run_token(kind, &text[start..]));
        }
    }

    /// Lexes `input` in one call and returns its tokens.
    fn tokenize(input: &'s str) -> Vec<Token<'s>> {
        let mut lexer = Lexer::new(input);
        lexer.lex();

        lexer.tokens
    }
}

/// Placeholder for the parser; it has no fields yet.
struct Parser {}