参加ThoughtWorks University的这一个来月没什么事情,闲下来就写写compiler玩。发现Lexer部分比较基础也比较常用,有很多相似的东西,每次都重写一遍太麻烦了,下面是我按照JLS(Java Language Specification)写的一个common java-like lexer,对于大多数接近java语法的语言估计是够用了。BTW:这个Lexer定义是TDD出来的,以通过测试为要务,可能可读性不太强。
1.WhiteSpace
// Consumes a run of one or more blanks and line terminators and marks the
// resulting token as Token.SKIP. newline() is called for every line
// terminator matched inside the run.
WhiteSpace
    : ( ' '                            // ASCII SP
      | '\t'                           // ASCII HT
      | '\f'                           // ASCII FF
      | LineTerminator { newline(); }
      )+
      { $setType(Token.SKIP); }
    ;
// Matches exactly one line terminator: CR LF, a lone CR, or a lone LF.
// The two-character form is listed first so that CR LF is consumed as a single
// terminator; with the ambiguity warning switched off, the match should not
// depend on how the CR / CR LF overlap happens to be resolved.
protected LineTerminator
    options { generateAmbigWarnings = false; }
    : "\r\n"   // ASCII CR followed by ASCII LF
    | '\r'     // ASCII CR
    | '\n'     // ASCII LF
    ;
2.Comments
// Public comment token: accepts either comment form and marks it as
// Token.SKIP.
Comment
    : ( SingleLineComment
      | MultiLineComment
      )
      { $setType(Token.SKIP); }
    ;
// "//" up to the end of the line. The trailing line terminator is optional
// (the comment may be the last thing in the input); when present it is
// consumed here and newline() is called for it.
protected SingleLineComment
    : "//"
      ( ~('\n' | '\r') )*
      ( LineTerminator { newline(); } )?
    ;
// "/*" ... "*/" comment. newline() is called for every line terminator inside
// the comment body.
//
// A '*' is accepted inside the body as long as it is not immediately followed
// by '/', so comments such as "/* a * b */" or "/** doc */" are recognized.
// The semantic predicate makes that decision explicit; the ambiguity warning
// for the loop is therefore switched off.
protected MultiLineComment
    : "/*"
      ( options { generateAmbigWarnings = false; }
      : { LA(2) != '/' }? '*'
      | LineTerminator { newline(); }
      | ~('*' | '\n' | '\r')
      )*
      "*/"
    ;
3.Escape Sequences
// One escape sequence inside a string or character literal.
//
// The leading backslash is dropped from the text ('!'). Letter escapes replace
// the rule text with the character they denote; quote and backslash escapes
// keep the matched character as is. Octal escapes allow up to three digits
// when the first digit is 0-3 and up to two digits when it is 4-7; the digits
// are parsed in base 8 and replaced by the resulting character.
// The second top-level alternative hands "\u..." over to UnicodeEscape.
//
// The parsed value is stored in a local before $setText is called, so that
// $getText is read while the digits are still in the text buffer.
protected EscapeSequence
    : '\\'!
      ( 'n' { $setText("\n"); }
      | 'r' { $setText("\r"); }
      | 't' { $setText("\t"); }
      | 'b' { $setText("\b"); }
      | 'f' { $setText("\f"); }
      | '"'
      | '\''
      | '\\'
      // octal escape, first digit 0-3: up to two more octal digits
      | '0'..'3'
        ( options { warnWhenFollowAmbig = false; }
        : '0'..'7'
          ( options { warnWhenFollowAmbig = false; }
          : '0'..'7'
          )?
        )?
        { char decoded = (char)Integer.parseInt($getText,8); $setText(decoded); }
      // octal escape, first digit 4-7: at most one more octal digit
      | '4'..'7'
        ( options { warnWhenFollowAmbig = false; }
        : '0'..'7'
        )?
        { char decoded = (char)Integer.parseInt($getText,8); $setText(decoded); }
      )
    | ("\\u") => UnicodeEscape
    ;
// Backslash, one or more 'u', then exactly four hex digits.
// The backslash is dropped ('!') and the text is cleared after the 'u' run, so
// that only the four hex digits remain when they are parsed in base 16 and
// replaced by the resulting character.
protected UnicodeEscape
    : '\\'!
      ( 'u' )+ { $setText(""); }
      HexDigit HexDigit HexDigit HexDigit
      { char decoded = (char)Integer.parseInt($getText,16); $setText(decoded); }
    ;
// A single hexadecimal digit, upper or lower case.
protected HexDigit
    : '0'..'9'
    | 'a'..'f'
    | 'A'..'F'
    ;
27
这个东西比较麻烦,种类很多,有像\t \n \r这样的escape,也有\uuu1234这样的unicode escape,还有octal escape。说实话,octal escape还是这次写compiler的时候新发现的,以前还真不知道有这么个东西,也从来没用过...汗啊...octal escape是对于不大于255(即\377)的数,可以用\012这样的八进制数表示,这个东西没想明白有什么用。反正JLS上写了,就按这个来吧。
4. String & Character Literal
// Double-quoted string literal. Both quotes are dropped from the token text
// ('!'); escape sequences are decoded by EscapeSequence.
// The plain-character alternative excludes the backslash, so a backslash can
// only ever start an EscapeSequence and the two alternatives no longer overlap.
StringLiteral
    : '"'!
      ( EscapeSequence
      | ~('"' | '\\')
      )*
      '"'!
    ;
// Single-quoted character literal. Both quotes are dropped from the token
// text ('!'); the body is optional, as in the original rule.
// The plain-character alternative excludes the single quote (the delimiter of
// this literal) and the backslash (which must start an EscapeSequence). A
// double quote is an ordinary character here, so '"' is accepted.
CharacterLiteral
    : '\''!
      ( EscapeSequence
      | ~('\'' | '\\')
      )?
      '\''!
    ;
5. NumericLiteral
// Entry rule for everything that starts like a number or a '.'-prefixed
// directive. Syntactic predicates route ".end", ".max" and '.' followed by a
// lower-case letter to the directive rules; everything else is an optionally
// signed integer, hex or floating-point literal. A leading '+' is dropped from
// the text ('!'), a leading '-' is kept. The token type reported is the one
// returned by the sub-rule that matched.
NumericLiteral
    options { testLiterals = true; }
    { int resolvedType = 0; }
    : ( (".end") => resolvedType = EndOfDirective
      | (".max") => resolvedType = MaxDirective
      | ('.' 'a'..'z') => resolvedType = Directives
      | ( '+'! | '-' )?
        ( resolvedType = IntegerLiteral
        | resolvedType = HexIntegerLiteral
        | resolvedType = DoubleLiteral
        )
      )
      { $setType(resolvedType); }
    ;
11
// Decimal or octal integer literal, optionally continued into a long or a
// floating-point literal. Returns the token type that was finally selected.
//
// The type starts as DecimalIntegerLiteral and is switched to
// OctalIntegerLiteral as soon as octal digits follow a leading '0' (that
// leading '0' is dropped from the text). A long suffix is dropped from the
// text and upgrades the type to the matching long type. Only a decimal
// literal may continue with a fraction or exponent, which turns it into a
// DoubleLiteral, or a FloatLiteral when a float suffix follows.
protected IntegerLiteral
    returns [int type = 0]
    { $setType(DecimalIntegerLiteral); }
    : ( '0'
      | '0'! ( '0'..'7' { $setType(OctalIntegerLiteral); } )+
      | '1'..'9' ( '0'..'9' )*
      )
      ( ( LongTypeSuffix!
          {
              if (_ttype == OctalIntegerLiteral) {
                  $setType(OctalLongLiteral);
              } else {
                  $setType(DecimalLongLiteral);
              }
          }
        )?
      | {_ttype == DecimalIntegerLiteral}?
        ( FloatingPointPart | ExponentPart ) { $setType(DoubleLiteral); }
        ( DoubleTypeSuffix!
        | FloatTypeSuffix! { $setType(FloatLiteral); }
        )?
      )
      { type = _ttype; }
    ;
// Hexadecimal integer literal. The "0x" / "0X" prefix and an optional long
// suffix are dropped from the text ('!'); the suffix switches the type to
// HexLongLiteral. Returns the token type that was finally selected.
protected HexIntegerLiteral
    returns [int type = 0]
    : ( '0'! ( 'x'! | 'X'! )
        ( HexDigit )+
        ( LongTypeSuffix! { $setType(HexLongLiteral); } )?
      )
      { type = _ttype; }
    ;
// Floating-point literal that starts directly with the fraction part (for
// example ".5"). An optional type suffix is dropped from the text; a float
// suffix switches the type to FloatLiteral. Returns the selected token type.
protected DoubleLiteral
    returns [int type = 0]
    : ( FloatingPointPart
        ( DoubleTypeSuffix!
        | FloatTypeSuffix! { $setType(FloatLiteral); }
        )?
      )
      { type = _ttype; }
    ;
// '.' followed by at least one decimal digit and an optional exponent.
protected FloatingPointPart
    : '.'
      ( '0'..'9' )+
      ( ExponentPart )?
    ;
// Exponent marker ('E' or 'e'), an optional sign, and at least one decimal
// digit.
protected ExponentPart
    : ( 'E' | 'e' )
      ( '+' | '-' )?
      ( '0'..'9' )+
    ;
// Suffix that marks an integer literal as long.
protected LongTypeSuffix
    : 'l'
    | 'L'
    ;
// Suffix that marks a floating-point literal as double.
protected DoubleTypeSuffix
    : 'd'
    | 'D'
    ;
// Suffix that marks a floating-point literal as float.
protected FloatTypeSuffix
    : 'f'
    | 'F'
    ;
这个是最复杂的一部分...
Unit Test比较长,节选吧
1 public void testShouldIgnoreWhiteSpaces() throws Exception {
2 assertRecognized(OctaneTokenTypes.EOF, " ");
3 assertRecognized(OctaneTokenTypes.EOF, "\t");
4 assertRecognized(OctaneTokenTypes.EOF, "\f");
5 }
6
7 public void testShouldIgnoreLineTerminators() throws Exception {
8 assertRecognized(OctaneTokenTypes.EOF, "\r");
9 assertRecognized(OctaneTokenTypes.EOF, "\n");
10 assertRecognized(OctaneTokenTypes.EOF, "\r\n");
11 }
12
13 public void testShouldIgnoreSingleLineComment() throws Exception {
14 assertRecognized(OctaneTokenTypes.EOF, "// comments 1234 &*^$\n");
15 }
16
17 public void testShouldIgnoreMultiLineComment() throws Exception {
18 assertRecognized(OctaneLexer.EOF, "/* comment line 1\ncomment line 2\n*/");
19 }
20
21 public void testShouldIncreaseLineNumberIfLineTerminatorsGiven() throws Exception {
22 assertEquals(2, createLexer("\r").nextToken().getLine());
23 assertEquals(2, createLexer("\n").nextToken().getLine());
24 assertEquals(2, createLexer("\r\n").nextToken().getLine());
25 }
26
27 public void testShouldRecognizeBasicEscapeInCharacterLiteral() throws Exception {
28 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\n", "'\\n'");
29 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\r", "'\\r'");
30 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\t", "'\\t'");
31 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\b", "'\\b'");
32 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\f", "'\\f'");
33 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\"", "'\\\"'");
34 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\\", "'\\\\'");
35 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\'", "'\\\''");
36 }
37
38 public void testShouldRecognizeBasicEscapeInStringLiteral() throws Exception {
39 assertRecognized(OctaneTokenTypes.StringLiteral, "\n", "\"\\n\"");
40 assertRecognized(OctaneTokenTypes.StringLiteral, "\r", "\"\\r\"");
41 assertRecognized(OctaneTokenTypes.StringLiteral, "\t", "\"\\t\"");
42 assertRecognized(OctaneTokenTypes.StringLiteral, "\b", "\"\\b\"");
43 assertRecognized(OctaneTokenTypes.StringLiteral, "\f", "\"\\f\"");
44 assertRecognized(OctaneTokenTypes.StringLiteral, "\"", "\"\\\"\"");
45 assertRecognized(OctaneTokenTypes.StringLiteral, "\\", "\"\\\\\"");
46 assertRecognized(OctaneTokenTypes.StringLiteral, "\'", "\"\\\'\"");
47 }
48
49 public void testShouldRecognizeOctalEscapeInCharacterLiteral() throws Exception {
50 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\077", "'\\077'");
51 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\77", "'\\77'");
52 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\37", "'\\37'");
53 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\7", "'\\7'");
54 }
55
56 public void testShouldRecognizeOctalEscapeInStringLiteral() throws Exception {
57 assertRecognized(OctaneTokenTypes.StringLiteral, "\077", "\"\\077\"");
58 assertRecognized(OctaneTokenTypes.StringLiteral, "\77", "\"\\77\"");
59 assertRecognized(OctaneTokenTypes.StringLiteral, "\37", "\"\\37\"");
60 assertRecognized(OctaneTokenTypes.StringLiteral, "\7", "\"\\7\"");
61 }
62
63 public void testShouldRecognizeUnicodeEscapeInCharacterLiteral() throws Exception {
64 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\u1234", "'\\u1234'");
65 assertRecognized(OctaneTokenTypes.CharacterLiteral, "\uu1234","'\\uu1234\'");
66 }
67
68 public void testShouldRecognizeUnicodeEscapeInStringLiteral() throws Exception {
69 assertRecognized(OctaneTokenTypes.StringLiteral, "\u1234", "\"\\u1234\"");
70 assertRecognized(OctaneTokenTypes.StringLiteral, "\uu1234", "\"\\uu1234\"");
71 }
72
73 public void testShouldRecognizeUnicodeInStringLiteral() throws Exception {
74 assertRecognized(OctaneTokenTypes.StringLiteral, "\"这是一行中文\"");
75 }
76
77 public void testShouldRecognizeDecimalIntegerLiteral() throws Exception {
78 assertRecognized(OctaneTokenTypes.DecimalIntegerLiteral, "0", "0");
79 assertRecognized(OctaneTokenTypes.DecimalIntegerLiteral, "-123", "-123");
80 }
81
82 public void testShouldRecognizeDecimalLongLiteral() throws Exception {
83 assertRecognized(OctaneTokenTypes.DecimalLongLiteral, "0", "0l");
84 assertRecognized(OctaneTokenTypes.DecimalLongLiteral, "-123", "-123L");
85 }
86
87 public void testShouldRecognizeHexIntegerLiteral() throws Exception {
88 assertRecognized(OctaneTokenTypes.HexIntegerLiteral, "1A3B", "+0x1A3B");
89 assertRecognized(OctaneTokenTypes.HexIntegerLiteral, "-1A3B", "-0x1A3B");
90 }
91
92 public void testShouldRecognizeHexLongLiteral() throws Exception {
93 assertRecognized(OctaneTokenTypes.HexLongLiteral, "1A3B", "+0x1A3BL");
94 assertRecognized(OctaneTokenTypes.HexLongLiteral, "-1A3F", "-0x1A3Fl");
95 }
96
97 public void testShouldRecognizeOctalIntegerLiteral() throws Exception {
98 assertRecognized(OctaneTokenTypes.OctalIntegerLiteral, "123", "+0123");
99 assertRecognized(OctaneTokenTypes.OctalIntegerLiteral, "-123", "-0123");
100 }
101
102 public void testShouldRecognizeOctalLongLiteral() throws Exception {
103 assertRecognized(OctaneTokenTypes.OctalLongLiteral, "1237", "+01237L");
104 assertRecognized(OctaneTokenTypes.OctalLongLiteral, "-1237", "-01237l");
105 }
106
107 public void testShouldRecognizeDoubleLiteral() throws Exception {
108 assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5", "+0.5");
109 assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5", "-.5");
110 assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5", "+0.5D");
111 assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5", "-.5d");
112 }
113
114 public void testShouldRecognizeDoubleLiteralInExponentialForm() throws Exception {
115 assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5e+10", "+0.5e+10");
116 assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5E-10", "-.5E-10");
117 assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5E+5", "+0.5E+5D");
118 assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5E-5", "-.5E-5d");
119 assertRecognized(OctaneTokenTypes.DoubleLiteral, "10E+5", "+10E+5d");
120 assertRecognized(OctaneTokenTypes.DoubleLiteral, "-10e-5", "-10e-5D");
121 }
122
123 public void testShouldRecognizeFloatLiteral() throws Exception {
124 assertRecognized(OctaneTokenTypes.FloatLiteral, "0.5", "+0.5F");
125 assertRecognized(OctaneTokenTypes.FloatLiteral, "-.5", "-.5f");
126 assertRecognized(OctaneTokenTypes.FloatLiteral, "10E+5", "+10E+5f");
127 assertRecognized(OctaneTokenTypes.FloatLiteral, "-10e-5", "-10e-5F");
128 }
129
130 public void testShouldRecognizeFloatLiteralInExponentialForm() throws Exception {
131 assertRecognized(OctaneTokenTypes.FloatLiteral, "0.5E+5", "+0.5E+5F");
132 assertRecognized(OctaneTokenTypes.FloatLiteral, "-.5e-5", "-.5e-5f");
133 }
134
135 protected void assertRecognized(int tokenType, String sourceString) throws Exception {
136 assertRecognized(tokenType, null, sourceString);
137
138 }
139
140 protected void assertRecognized(int tokenType, String exceptedText, String sourceString) throws Exception {
141 assertRecognized(new int[] { tokenType }, exceptedText == null ? null : new String[] { exceptedText }, sourceString);
142 }
143
144 protected void assertRecognized(int[] tokenTypes, String[] exceptedText, String sourceString) throws TokenStreamException {
145 TokenStream lexer = createLexer(sourceString);
146 for (int i = 0; i < tokenTypes.length; i++) {
147 Token token = lexer.nextToken();
148 assertEquals(tokenTypes[i], token.getType());
149 if (exceptedText != null) assertEquals(exceptedText[i], token.getText());
150 }
151 assertEquals(OctaneTokenTypes.EOF, lexer.nextToken().getType());
152 }