added a simple handmade tokenizer

2025-09-15 15:58:41 -04:00 · 2016-07-03 11:54:22 +02:00 · 2016-07-03 11:54:22 +02:00 · afa24f6979
commit afa24f6979
parent 8232f31829
3 changed files with 196 additions and 47 deletions
--- a/src/main/java/de/neemann/digital/analyse/parser/Parser.java
+++ b/src/main/java/de/neemann/digital/analyse/parser/Parser.java
@ -8,11 +8,9 @@ import de.neemann.digital.lang.Lang;

 import java.io.IOException;
 import java.io.Reader;
-import java.io.StreamTokenizer;
 import java.io.StringReader;

-import static java.io.StreamTokenizer.TT_EOF;
-import static java.io.StreamTokenizer.TT_WORD;
+import static de.neemann.digital.analyse.parser.Tokenizer.Token.*;

 /**
 * Class to parse a string to an expression
@ -21,10 +19,11 @@ import static java.io.StreamTokenizer.TT_WORD;
 */
 public class Parser {

-    private final StreamTokenizer tokenizer;
+    private final Tokenizer tokenizer;

    /**
     * Creates a new instance
+     *
     * @param expression the string to parse
     */
    public Parser(String expression) {
@ -33,52 +32,31 @@ public class Parser {

    /**
     * Creates a new instance
+     *
     * @param reader the reader to read the expression
     */
    public Parser(Reader reader) {
-        tokenizer = new StreamTokenizer(reader);
-        tokenizer.wordChars('_', '_');
-        tokenizer.wordChars('^', '^');
-        tokenizer.wordChars('0', '9');
-//        tokenizer.ordinaryChar('∧'); StreamTokenizer can not handle ordinary chars > 255
-//        tokenizer.ordinaryChar('∨');
+        tokenizer = new Tokenizer(reader);
    }

-    private boolean isNext(String str) throws IOException {
-        int t = tokenizer.nextToken();
-        if (t == TT_WORD && tokenizer.sval.equalsIgnoreCase(str))
-            return true;
-
-        tokenizer.pushBack();
-        return false;
-    }
-
-    private boolean isNext(int c) throws IOException {
-        int t = tokenizer.nextToken();
-        if (t == c)
-            return true;
-
-        tokenizer.pushBack();
-        return false;
-    }
-
-
    /**
     * Parses the the string expression and returns a expression instance
+     *
     * @return the expresion instance
-     * @throws IOException IOException
+     * @throws IOException    IOException
     * @throws ParseException ParseException
     */
    public Expression parse() throws IOException, ParseException {
        Expression expr = parseOr();
-        if (!isNext(TT_EOF))
+        if (!(tokenizer.next() == EOF))
            throw new ParseException(Lang.get("err_parserUnexpectedEndOfExpression"));
        return expr;
    }

    private Expression parseOr() throws IOException, ParseException {
        Expression ex = parseAnd();
-        while (isNext('+') || isNext("∨") || isNext('|')) {
+        while (tokenizer.peek() == OR) {
+            tokenizer.next();
            ex = Operation.or(ex, parseAnd());
        }
        return ex;
@ -86,24 +64,26 @@ public class Parser {

    private Expression parseAnd() throws IOException, ParseException {
        Expression ex = parseSimpleExp();
-        while (isNext('*') || isNext("∧") || isNext('&')) {
+        while (tokenizer.peek() == AND) {
+            tokenizer.next();
            ex = Operation.and(ex, parseSimpleExp());
        }
        return ex;
    }

    private Expression parseSimpleExp() throws IOException, ParseException {
-        if (isNext('!')) {
-            return Not.not(parseSimpleExp());
-        } else if (isNext('(')) {
-            Expression exp = parseOr();
-            if (!isNext(')'))
-                throw new ParseException(Lang.get("err_parserMissingClosedParenthesis"));
-            return exp;
-        } else if (isNext(TT_WORD)) {
-            return new Variable(tokenizer.sval);
-        } else
-            throw new ParseException(Lang.get("err_parserUnexpectedToken_N", tokenizer.sval));
+        switch (tokenizer.next()) {
+            case NOT:
+                return Not.not(parseSimpleExp());
+            case OPEN:
+                Expression exp = parseOr();
+                if (!(tokenizer.next() == CLOSE))
+                    throw new ParseException(Lang.get("err_parserMissingClosedParenthesis"));
+                return exp;
+            case IDENT:
+                return new Variable(tokenizer.getIdent());
+            default:
+                throw new ParseException(Lang.get("err_parserUnexpectedToken_N", tokenizer.toString()));
+        }
    }
-
 }
--- a/src/main/java/de/neemann/digital/analyse/parser/Tokenizer.java
+++ b/src/main/java/de/neemann/digital/analyse/parser/Tokenizer.java
@ -0,0 +1,157 @@
+package de.neemann.digital.analyse.parser;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/** Handmade tokenizer which splits the characters of a {@link Reader} into {@link Token}s, with one token of look-ahead.
+ * @author hneemann
+ */
+public class Tokenizer {
+
+    /** The token types; UNKNOWN is reported for a character which starts none of the other tokens. */
+    enum Token {UNKNOWN, IDENT, AND, OR, NOT, OPEN, CLOSE, EOF}
+
+    private final Reader in;
+    private Token token; // the most recently scanned token
+    private boolean isToken; // true while 'token' was peeked but not yet consumed by next()
+    private StringBuilder builder; // text of the last IDENT token, or the character of the last UNKNOWN token
+    private boolean isUnreadChar = false; // true if 'unreadChar' holds a pushed back character
+    private int unreadChar; // one character push back buffer, filled by unreadChar(int)
+
+    /**
+     * Creates a new instance
+     *
+     * @param in the reader the characters are read from
+     */
+    public Tokenizer(Reader in) {
+        this.in = in;
+        token = Token.UNKNOWN;
+        isToken = false;
+        builder = new StringBuilder();
+    }
+
+    /**
+     * Reads the next token and consumes it, so the following call scans a new token
+     *
+     * @return the token
+     * @throws IOException IOException
+     */
+    public Token next() throws IOException {
+        peek(); // makes sure 'token' holds the upcoming token
+        isToken = false; // mark it as consumed
+        return token;
+    }
+
+    /**
+     * peeks the next token, scanning it from the reader only if it was not peeked before.
+     * The token is kept in the stream, so next will return this token again!
+     *
+     * @return the token
+     * @throws IOException IOException
+     */
+    public Token peek() throws IOException {
+        if (isToken)
+            return token;
+
+        int c;
+        do {
+            c = readChar();
+        } while (isWhiteSpace(c)); // skip white space in front of the token
+
+        switch (c) {
+            case -1: // Reader.read() returns -1 at the end of the stream
+                token = Token.EOF;
+                break;
+            case '(':
+                token = Token.OPEN;
+                break;
+            case ')':
+                token = Token.CLOSE;
+                break;
+            case '&': // both "&" and "&&" are accepted
+                c = readChar();
+                if (c != '&') unreadChar(c); // no second '&': push it back; intentionally falls through to AND
+            case '*':
+            case '∧':
+                token = Token.AND;
+                break;
+            case '|': // both "|" and "||" are accepted
+                c = readChar();
+                if (c != '|') unreadChar(c); // no second '|': push it back; intentionally falls through to OR
+            case '+':
+            case '∨':
+                token = Token.OR;
+                break;
+            case '¬':
+            case '!':
+                token = Token.NOT;
+                break;
+            default:
+                if (isIdentChar(c)) { // identifier: an ident char followed by any number of ident chars or digits
+                    token = Token.IDENT;
+                    builder.setLength(0);
+                    builder.append((char) c);
+                    boolean wasChar = true;
+                    do {
+                        c = readChar();
+                        if (isIdentChar(c) || isNumberChar(c)) {
+                            builder.append((char) c);
+                        } else {
+                            unreadChar(c); // belongs to the following token (may also be -1 at the end of the stream)
+                            wasChar = false;
+                        }
+                    } while (wasChar);
+                } else { // unrecognized character: keep it in the builder so toString() can report it
+                    token = Token.UNKNOWN;
+                    builder.setLength(0);
+                    builder.append((char) c);
+                }
+        }
+
+        isToken = true; // the token stays available until next() consumes it
+        return token;
+    }
+
+    /**
+     * @return the text of the last IDENT token; after an UNKNOWN token the unrecognized character
+     */
+    public String getIdent() {
+        return builder.toString();
+    }
+
+    private int readChar() throws IOException { // a pushed back character is delivered before the reader is asked
+        if (isUnreadChar) {
+            isUnreadChar = false;
+            return unreadChar;
+        } else
+            return in.read();
+    }
+
+    private void unreadChar(int c) {
+        unreadChar = c; // single slot: a second call before the next readChar() overwrites the value
+        isUnreadChar = true;
+    }
+
+    private boolean isIdentChar(int c) { // ASCII letters, '_' and '^'
+        return (c >= 'a' && c <= 'z')
+                || (c >= 'A' && c <= 'Z')
+                || (c == '_')
+                || (c == '^');
+    }
+
+    private boolean isNumberChar(int c) { // digits are allowed in an identifier, but not as its first character
+        return (c >= '0' && c <= '9');
+    }
+
+    private boolean isWhiteSpace(int c) {
+        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
+    }
+
+    @Override
+    public String toString() {
+        if (token == Token.IDENT || token == Token.UNKNOWN) // these two carry their text in the builder
+            return getIdent();
+        else
+            return token.name();
+    }
+}
--- a/src/test/java/de/neemann/digital/analyse/parser/ParserTest.java
+++ b/src/test/java/de/neemann/digital/analyse/parser/ParserTest.java
@ -22,14 +22,16 @@ public class ParserTest extends TestCase {

    public void testParseOr() throws Exception {
        assertTrue(new Parser("a+b").parse() instanceof Operation.Or);
-        assertTrue(new Parser("a ∨ b").parse() instanceof Operation.Or);
+        assertTrue(new Parser("a∨b").parse() instanceof Operation.Or);
        assertTrue(new Parser("a|b").parse() instanceof Operation.Or);
+        assertTrue(new Parser("a||b").parse() instanceof Operation.Or);
    }

    public void testParseAnd() throws Exception {
        assertTrue(new Parser("a*b").parse() instanceof Operation.And);
-        assertTrue(new Parser("a ∧ b").parse() instanceof Operation.And);
+        assertTrue(new Parser("a∧b").parse() instanceof Operation.And);
        assertTrue(new Parser("a&b").parse() instanceof Operation.And);
+        assertTrue(new Parser("a&&b").parse() instanceof Operation.And);
    }

    public void testParseParenthesis() throws Exception {
@ -111,5 +113,15 @@ public class ParserTest extends TestCase {
        }
    }

+    public void testParseException5() throws Exception {
+        Parser p = new Parser("ö"); // 'ö' is neither an operator nor an identifier character of the Tokenizer
+        try {
+            p.parse();
+            fail("expected a ParseException for the unknown character 'ö'");
+        } catch (ParseException e) {
+            // expected: the parser has to reject the unknown token
+        }
+    }
+

 }