From 63053efd2a5ddf3cb63dd3f93d1d9ab760cbb379 Mon Sep 17 00:00:00 2001 From: Sam Harwell Date: Tue, 1 Jul 2014 17:28:30 -0500 Subject: [PATCH] Updated TestPerformance and associated code for the latest release --- .../Antlr4.Runtime.Test.Portable.csproj | 5 +- .../Antlr4.Runtime.Test.net20.csproj | 1 + .../Antlr4.Runtime.Test.net30.csproj | 1 + .../Antlr4.Runtime.Test.net35-client.csproj | 1 + .../Antlr4.Runtime.Test.net40-client.csproj | 1 + .../Antlr4.Runtime.Test.net45.csproj | 1 + .../CSharp/Antlr4.Runtime.Test/BaseTest.cs | 28 +- runtime/CSharp/Antlr4.Runtime.Test/Java-LR.g4 | 681 ++++++++++++----- runtime/CSharp/Antlr4.Runtime.Test/Java.g4 | 683 +++++++++++++----- .../JavaUnicodeInputStream.cs | 272 +++++++ .../Antlr4.Runtime.Test/TestPerformance.cs | 157 +++- 11 files changed, 1458 insertions(+), 373 deletions(-) create mode 100644 runtime/CSharp/Antlr4.Runtime.Test/JavaUnicodeInputStream.cs diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.Portable.csproj b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.Portable.csproj index 88f525bd6..9b90fb1ce 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.Portable.csproj +++ b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.Portable.csproj @@ -26,7 +26,7 @@ full false bin\Portable\Debug\ - DEBUG;TRACE;NET_3_5;NET_4_0 + DEBUG;TRACE;PORTABLE;NET40;NET40PLUS;NET35PLUS;NET30PLUS;NET20PLUS prompt 4 true @@ -35,7 +35,7 @@ pdbonly true bin\Portable\Release\ - TRACE;NET_3_5;NET_4_0 + TRACE;PORTABLE;NET40;NET40PLUS;NET35PLUS;NET30PLUS;NET20PLUS prompt 4 true @@ -57,6 +57,7 @@ + diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net20.csproj b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net20.csproj index 61d956120..d37112975 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net20.csproj +++ b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net20.csproj @@ -57,6 +57,7 @@ + diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net30.csproj b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net30.csproj index ba625ad89..6c03a427a 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net30.csproj +++ b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net30.csproj @@ -57,6 +57,7 @@ + diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net35-client.csproj b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net35-client.csproj index cddaa77f0..23396a24c 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net35-client.csproj +++ b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net35-client.csproj @@ -57,6 +57,7 @@ + diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net40-client.csproj b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net40-client.csproj index 3ee96696a..22245fe23 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net40-client.csproj +++ b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net40-client.csproj @@ -57,6 +57,7 @@ + diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net45.csproj b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net45.csproj index 455616915..737a733c5 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net45.csproj +++ b/runtime/CSharp/Antlr4.Runtime.Test/Antlr4.Runtime.Test.net45.csproj @@ -57,6 +57,7 @@ + diff --git a/runtime/CSharp/Antlr4.Runtime.Test/BaseTest.cs b/runtime/CSharp/Antlr4.Runtime.Test/BaseTest.cs index 186e7215c..df9dbd717 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/BaseTest.cs +++ b/runtime/CSharp/Antlr4.Runtime.Test/BaseTest.cs @@ -268,11 +268,11 @@ string compiler = PathCombine(JavaHome, "bin", "java.exe"); List classpath = new List(); - classpath.Add(GetMavenArtifact("com.tunnelvisionlabs", "antlr4-csharp", "4.0.1-SNAPSHOT")); - classpath.Add(GetMavenArtifact("com.tunnelvisionlabs", "antlr4-runtime", "4.0.1-SNAPSHOT")); - classpath.Add(GetMavenArtifact("com.tunnelvisionlabs", "antlr4", "4.0.1-SNAPSHOT")); - classpath.Add(GetMavenArtifact("org.antlr", "antlr-runtime", "3.5")); - classpath.Add(GetMavenArtifact("org.antlr", "ST4", "4.0.7")); + classpath.Add(GetMavenArtifact("com.tunnelvisionlabs", "antlr4-csharp", "4.3-SNAPSHOT")); + classpath.Add(GetMavenArtifact("com.tunnelvisionlabs", "antlr4-runtime", "4.3")); + classpath.Add(GetMavenArtifact("com.tunnelvisionlabs", "antlr4", "4.3")); + classpath.Add(GetMavenArtifact("org.antlr", "antlr-runtime", "3.5.2")); + classpath.Add(GetMavenArtifact("org.antlr", "ST4", "4.0.8")); List options = new List(); options.Add("-cp"); @@ -284,7 +284,23 @@ options.Add(tmpdir); options.Add("-lib"); options.Add(tmpdir); - options.Add("-Dlanguage=CSharp"); + +#if PORTABLE + options.Add("-Dlanguage=CSharp_v4_5"); +#elif NET45 + options.Add("-Dlanguage=CSharp_v4_5"); +#elif NET40 + options.Add("-Dlanguage=CSharp_v4_0"); +#elif NET35 + options.Add("-Dlanguage=CSharp_v3_5"); +#elif NET30 + options.Add("-Dlanguage=CSharp_v3_0"); +#elif NET20 + options.Add("-Dlanguage=CSharp_v2_0"); +#else +#error Unknown assembly. +#endif + options.Add(grammarFileName); System.Diagnostics.Process process = System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(compiler, '"' + Utils.Join("\" \"", options) + '"') diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Java-LR.g4 b/runtime/CSharp/Antlr4.Runtime.Test/Java-LR.g4 index 4c4635d10..9c088a800 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Java-LR.g4 +++ b/runtime/CSharp/Antlr4.Runtime.Test/Java-LR.g4 @@ -171,14 +171,48 @@ grammar Java; @lexer::members { - protected const int EOF = Eof; - protected const int HIDDEN = Hidden; - protected bool enumIsKeyword = true; - protected bool assertIsKeyword = true; + +private static bool IsJavaIdentifierCharacter(char c, bool start) +{ + switch (char.GetUnicodeCategory(c)) + { + case System.Globalization.UnicodeCategory.UppercaseLetter: + case System.Globalization.UnicodeCategory.LowercaseLetter: + case System.Globalization.UnicodeCategory.TitlecaseLetter: + case System.Globalization.UnicodeCategory.ModifierLetter: + case System.Globalization.UnicodeCategory.OtherLetter: + // isLetter returns true + return true; + + case System.Globalization.UnicodeCategory.LetterNumber: + // getType returns LETTER_NUMBER + return true; + + case System.Globalization.UnicodeCategory.CurrencySymbol: + // a currency symbol (such as "$") + return true; + + case System.Globalization.UnicodeCategory.ConnectorPunctuation: + // a connecting punctuation character (such as "_") + return true; + + case System.Globalization.UnicodeCategory.DecimalDigitNumber: + // it is a digit + return !start; + + case System.Globalization.UnicodeCategory.SpacingCombiningMark: + // it is a combining mark + return !start; + + case System.Globalization.UnicodeCategory.NonSpacingMark: + // it is a non-spacing mark + return !start; + + default: + return false; + } } -@parser::members { - protected const int EOF = Eof; } // starting point for parsing a java file @@ -216,14 +250,15 @@ classOrInterfaceModifiers ; classOrInterfaceModifier - : annotation // class or interface - | 'public' // class or interface - | 'protected' // class or interface - | 'private' // class or interface - | 'abstract' // class or interface - | 'static' // class or interface - | 'final' // class only -- does not apply to interfaces - | 'strictfp' // class or interface + : annotation // class or interface + | ( 'public' // class or interface + | 'protected' // class or interface + | 'private' // class or interface + | 'abstract' // class or interface + | 'static' // class or interface + | 'final' // class only -- does not apply to interfaces + | 'strictfp' // class or interface + ) ; modifiers @@ -420,17 +455,18 @@ arrayInitializer modifier : annotation - | 'public' - | 'protected' - | 'private' - | 'static' - | 'abstract' - | 'final' - | 'native' - | 'synchronized' - | 'transient' - | 'volatile' - | 'strictfp' + | ( 'public' + | 'protected' + | 'private' + | 'static' + | 'abstract' + | 'final' + | 'native' + | 'synchronized' + | 'transient' + | 'volatile' + | 'strictfp' + ) ; packageOrTypeName @@ -501,39 +537,22 @@ methodBody ; constructorBody - : '{' explicitConstructorInvocation? blockStatement* '}' + : block ; -explicitConstructorInvocation - : nonWildcardTypeArguments? ('this' | 'super') arguments ';' - | primary '.' nonWildcardTypeArguments? 'super' arguments ';' - ; - - qualifiedName : Identifier ('.' Identifier)* ; literal - : integerLiteral + : IntegerLiteral | FloatingPointLiteral | CharacterLiteral | StringLiteral - | booleanLiteral + | BooleanLiteral | 'null' ; -integerLiteral - : HexLiteral - | OctalLiteral - | DecimalLiteral - ; - -booleanLiteral - : 'true' - | 'false' - ; - // ANNOTATIONS annotations @@ -576,6 +595,7 @@ annotationTypeBody annotationTypeElementDeclaration : modifiers annotationTypeElementRest + | ';' // this is not allowed by the grammar, but apparently allowed by the actual compiler ; annotationTypeElementRest @@ -628,18 +648,14 @@ variableModifiers ; statement -@leftfactor{catches} : block | ASSERT expression (':' expression)? ';' | 'if' parExpression statement ('else' statement)? | 'for' '(' forControl ')' statement | 'while' parExpression statement | 'do' statement 'while' parExpression ';' - | 'try' block - ( catches 'finally' block - | catches - | 'finally' block - ) + | 'try' block (catches finallyBlock? | finallyBlock) + | 'try' resourceSpecification block catches? finallyBlock? | 'switch' parExpression '{' switchBlockStatementGroups '}' | 'synchronized' parExpression block | 'return' expression? ';' @@ -650,15 +666,35 @@ statement | statementExpression ';' | Identifier ':' statement ; - + catches - : catchClause (catchClause)* + : catchClause+ ; - + catchClause - : 'catch' '(' formalParameter ')' block + : 'catch' '(' variableModifiers catchType Identifier ')' block ; +catchType + : qualifiedName ('|' qualifiedName)* + ; + +finallyBlock + : 'finally' block + ; + +resourceSpecification + : '(' resources ';'? ')' + ; + +resources + : resource (';' resource)* + ; + +resource + : variableModifiers classOrInterfaceType variableDeclaratorId '=' expression + ; + formalParameter : variableModifiers type variableDeclaratorId ; @@ -721,9 +757,8 @@ expression : primary | expression '.' Identifier | expression '.' 'this' - | expression '.' 'super' '(' expressionList? ')' - | expression '.' 'new' Identifier '(' expressionList? ')' - | expression '.' 'super' '.' Identifier arguments? + | expression '.' 'new' nonWildcardTypeArguments? innerCreator + | expression '.' 'super' superSuffix | expression '.' explicitGenericInvocation | 'new' creator | expression '[' expression ']' @@ -735,28 +770,28 @@ expression | expression ('*'|'/'|'%') expression | expression ('+'|'-') expression | expression ('<' '<' | '>' '>' '>' | '>' '>') expression - | expression ('<' '=' | '>' '=' | '>' | '<') expression + | expression ('<=' | '>=' | '>' | '<') expression | expression 'instanceof' type | expression ('==' | '!=') expression | expression '&' expression - | expression '^' expression + | expression '^' expression | expression '|' expression | expression '&&' expression | expression '||' expression | expression '?' expression ':' expression - | expression - ('=' - | '+=' - | '-=' - | '*=' - | '/=' - | '&=' - | '|=' - | '^=' - | '>' '>' '=' - | '>' '>' '>' '=' - | '<' '<' '=' - | '%=' + | expression + ( '=' + | '+=' + | '-=' + | '*=' + | '/=' + | '&=' + | '|=' + | '^=' + | '>>=' + | '>>>=' + | '<<=' + | '%=' ) expression ; @@ -769,6 +804,7 @@ primary | Identifier | type '.' 'class' | 'void' '.' 'class' + | nonWildcardTypeArguments (explicitGenericInvocationSuffix | 'this' arguments) ; creator @@ -777,12 +813,12 @@ creator ; createdName - : classOrInterfaceType + : Identifier typeArgumentsOrDiamond? ('.' Identifier typeArgumentsOrDiamond?)* | primitiveType ; innerCreator - : nonWildcardTypeArguments? Identifier classCreatorRest + : Identifier nonWildcardTypeArgumentsOrDiamond? classCreatorRest ; arrayCreatorRest @@ -797,148 +833,461 @@ classCreatorRest ; explicitGenericInvocation - : nonWildcardTypeArguments Identifier arguments + : nonWildcardTypeArguments explicitGenericInvocationSuffix ; nonWildcardTypeArguments : '<' typeList '>' ; - -selector - : '.' Identifier arguments? - | '.' 'this' - | '.' 'super' superSuffix - | '.' 'new' innerCreator - | '[' expression ']' - ; - + +typeArgumentsOrDiamond + : '<' '>' + | typeArguments + ; + +nonWildcardTypeArgumentsOrDiamond + : '<' '>' + | nonWildcardTypeArguments + ; + superSuffix : arguments | '.' Identifier arguments? ; +explicitGenericInvocationSuffix + : 'super' superSuffix + | Identifier arguments + ; + arguments : '(' expressionList? ')' ; // LEXER -HexLiteral : '0' ('x'|'X') HexDigit+ IntegerTypeSuffix? ; +// §3.9 Keywords -DecimalLiteral : ('0' | '1'..'9' '0'..'9'*) IntegerTypeSuffix? ; +ABSTRACT : 'abstract'; +ASSERT : 'assert'; +BOOLEAN : 'boolean'; +BREAK : 'break'; +BYTE : 'byte'; +CASE : 'case'; +CATCH : 'catch'; +CHAR : 'char'; +CLASS : 'class'; +CONST : 'const'; +CONTINUE : 'continue'; +DEFAULT : 'default'; +DO : 'do'; +DOUBLE : 'double'; +ELSE : 'else'; +ENUM : 'enum'; +EXTENDS : 'extends'; +FINAL : 'final'; +FINALLY : 'finally'; +FLOAT : 'float'; +FOR : 'for'; +IF : 'if'; +GOTO : 'goto'; +IMPLEMENTS : 'implements'; +IMPORT : 'import'; +INSTANCEOF : 'instanceof'; +INT : 'int'; +INTERFACE : 'interface'; +LONG : 'long'; +NATIVE : 'native'; +NEW : 'new'; +PACKAGE : 'package'; +PRIVATE : 'private'; +PROTECTED : 'protected'; +PUBLIC : 'public'; +RETURN : 'return'; +SHORT : 'short'; +STATIC : 'static'; +STRICTFP : 'strictfp'; +SUPER : 'super'; +SWITCH : 'switch'; +SYNCHRONIZED : 'synchronized'; +THIS : 'this'; +THROW : 'throw'; +THROWS : 'throws'; +TRANSIENT : 'transient'; +TRY : 'try'; +VOID : 'void'; +VOLATILE : 'volatile'; +WHILE : 'while'; -OctalLiteral : '0' ('0'..'7')+ IntegerTypeSuffix? ; +// §3.10.1 Integer Literals + +IntegerLiteral + : DecimalIntegerLiteral + | HexIntegerLiteral + | OctalIntegerLiteral + | BinaryIntegerLiteral + ; fragment -HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ; +DecimalIntegerLiteral + : DecimalNumeral IntegerTypeSuffix? + ; fragment -IntegerTypeSuffix : ('l'|'L') ; +HexIntegerLiteral + : HexNumeral IntegerTypeSuffix? + ; + +fragment +OctalIntegerLiteral + : OctalNumeral IntegerTypeSuffix? + ; + +fragment +BinaryIntegerLiteral + : BinaryNumeral IntegerTypeSuffix? + ; + +fragment +IntegerTypeSuffix + : [lL] + ; + +fragment +DecimalNumeral + : '0' + | NonZeroDigit (Digits? | Underscores Digits) + ; + +fragment +Digits + : Digit (DigitsAndUnderscores? Digit)? + ; + +fragment +Digit + : '0' + | NonZeroDigit + ; + +fragment +NonZeroDigit + : [1-9] + ; + +fragment +DigitsAndUnderscores + : DigitOrUnderscore+ + ; + +fragment +DigitOrUnderscore + : Digit + | '_' + ; + +fragment +Underscores + : '_'+ + ; + +fragment +HexNumeral + : '0' [xX] HexDigits + ; + +fragment +HexDigits + : HexDigit (HexDigitsAndUnderscores? HexDigit)? + ; + +fragment +HexDigit + : [0-9a-fA-F] + ; + +fragment +HexDigitsAndUnderscores + : HexDigitOrUnderscore+ + ; + +fragment +HexDigitOrUnderscore + : HexDigit + | '_' + ; + +fragment +OctalNumeral + : '0' Underscores? OctalDigits + ; + +fragment +OctalDigits + : OctalDigit (OctalDigitsAndUnderscores? OctalDigit)? + ; + +fragment +OctalDigit + : [0-7] + ; + +fragment +OctalDigitsAndUnderscores + : OctalDigitOrUnderscore+ + ; + +fragment +OctalDigitOrUnderscore + : OctalDigit + | '_' + ; + +fragment +BinaryNumeral + : '0' [bB] BinaryDigits + ; + +fragment +BinaryDigits + : BinaryDigit (BinaryDigitsAndUnderscores? BinaryDigit)? + ; + +fragment +BinaryDigit + : [01] + ; + +fragment +BinaryDigitsAndUnderscores + : BinaryDigitOrUnderscore+ + ; + +fragment +BinaryDigitOrUnderscore + : BinaryDigit + | '_' + ; + +// §3.10.2 Floating-Point Literals FloatingPointLiteral - : ('0'..'9')+ '.' ('0'..'9')* Exponent? FloatTypeSuffix? - | '.' ('0'..'9')+ Exponent? FloatTypeSuffix? - | ('0'..'9')+ Exponent FloatTypeSuffix? - | ('0'..'9')+ FloatTypeSuffix - | '0' ('x'|'X') - ( HexDigit+ ('.' HexDigit*)? HexExponent FloatTypeSuffix? - | '.' HexDigit+ HexExponent FloatTypeSuffix? - ) - ; + : DecimalFloatingPointLiteral + | HexadecimalFloatingPointLiteral + ; fragment -Exponent : ('e'|'E') ('+'|'-')? ('0'..'9')+ ; +DecimalFloatingPointLiteral + : Digits '.' Digits? ExponentPart? FloatTypeSuffix? + | '.' Digits ExponentPart? FloatTypeSuffix? + | Digits ExponentPart FloatTypeSuffix? + | Digits FloatTypeSuffix + ; fragment -HexExponent : ('p'|'P') ('+'|'-')? ('0'..'9')+ ; +ExponentPart + : ExponentIndicator SignedInteger + ; fragment -FloatTypeSuffix : ('f'|'F'|'d'|'D') ; +ExponentIndicator + : [eE] + ; + +fragment +SignedInteger + : Sign? Digits + ; + +fragment +Sign + : [+-] + ; + +fragment +FloatTypeSuffix + : [fFdD] + ; + +fragment +HexadecimalFloatingPointLiteral + : HexSignificand BinaryExponent FloatTypeSuffix? + ; + +fragment +HexSignificand + : HexNumeral '.'? + | '0' [xX] HexDigits? '.' HexDigits + ; + +fragment +BinaryExponent + : BinaryExponentIndicator SignedInteger + ; + +fragment +BinaryExponentIndicator + : [pP] + ; + +// §3.10.3 Boolean Literals + +BooleanLiteral + : 'true' + | 'false' + ; + +// §3.10.4 Character Literals CharacterLiteral - : '\'' ( EscapeSequence | ~('\''|'\\') ) '\'' - ; + : '\'' SingleCharacter '\'' + | '\'' EscapeSequence '\'' + ; + +fragment +SingleCharacter + : ~['\\] + ; + +// §3.10.5 String Literals StringLiteral - : '"' ( EscapeSequence | ~('\\'|'"') )* '"' - ; + : '"' StringCharacters? '"' + ; + +fragment +StringCharacters + : StringCharacter+ + ; + +fragment +StringCharacter + : ~["\\] + | EscapeSequence + ; + +// §3.10.6 Escape Sequences for Character and String Literals fragment EscapeSequence - : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\') - | UnicodeEscape - | OctalEscape - ; + : '\\' [btnfr"'\\] + | OctalEscape + ; fragment OctalEscape - : '\\' ('0'..'3') ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') - ; + : '\\' OctalDigit + | '\\' OctalDigit OctalDigit + | '\\' ZeroToThree OctalDigit OctalDigit + ; fragment -UnicodeEscape - : '\\' 'u' HexDigit HexDigit HexDigit HexDigit - ; +ZeroToThree + : [0-3] + ; -ENUM: 'enum' {enumIsKeyword}? - ; - -ASSERT - : 'assert' {assertIsKeyword}? - ; - -Identifier - : Letter (Letter|JavaIDDigit)* - ; +// §3.10.7 The Null Literal -/**I found this char range in JavaCC's grammar, but Letter and Digit overlap. - Still works, but... - */ -fragment -Letter - : '\u0024' | - '\u0041'..'\u005a' | - '\u005f' | - '\u0061'..'\u007a' | - '\u00c0'..'\u00d6' | - '\u00d8'..'\u00f6' | - '\u00f8'..'\u00ff' | - '\u0100'..'\u1fff' | - '\u3040'..'\u318f' | - '\u3300'..'\u337f' | - '\u3400'..'\u3d2d' | - '\u4e00'..'\u9fff' | - '\uf900'..'\ufaff' - ; +NullLiteral + : 'null' + ; + +// §3.11 Separators + +LPAREN : '('; +RPAREN : ')'; +LBRACE : '{'; +RBRACE : '}'; +LBRACK : '['; +RBRACK : ']'; +SEMI : ';'; +COMMA : ','; +DOT : '.'; + +// §3.12 Operators + +ASSIGN : '='; +GT : '>'; +LT : '<'; +BANG : '!'; +TILDE : '~'; +QUESTION : '?'; +COLON : ':'; +EQUAL : '=='; +LE : '<='; +GE : '>='; +NOTEQUAL : '!='; +AND : '&&'; +OR : '||'; +INC : '++'; +DEC : '--'; +ADD : '+'; +SUB : '-'; +MUL : '*'; +DIV : '/'; +BITAND : '&'; +BITOR : '|'; +CARET : '^'; +MOD : '%'; + +ADD_ASSIGN : '+='; +SUB_ASSIGN : '-='; +MUL_ASSIGN : '*='; +DIV_ASSIGN : '/='; +AND_ASSIGN : '&='; +OR_ASSIGN : '|='; +XOR_ASSIGN : '^='; +MOD_ASSIGN : '%='; +LSHIFT_ASSIGN : '<<='; +RSHIFT_ASSIGN : '>>='; +URSHIFT_ASSIGN : '>>>='; + +// §3.8 Identifiers (must appear after all keywords in the grammar) + +Identifier + : JavaLetter JavaLetterOrDigit* + ; fragment -JavaIDDigit - : '\u0030'..'\u0039' | - '\u0660'..'\u0669' | - '\u06f0'..'\u06f9' | - '\u0966'..'\u096f' | - '\u09e6'..'\u09ef' | - '\u0a66'..'\u0a6f' | - '\u0ae6'..'\u0aef' | - '\u0b66'..'\u0b6f' | - '\u0be7'..'\u0bef' | - '\u0c66'..'\u0c6f' | - '\u0ce6'..'\u0cef' | - '\u0d66'..'\u0d6f' | - '\u0e50'..'\u0e59' | - '\u0ed0'..'\u0ed9' | - '\u1040'..'\u1049' - ; +JavaLetter + : [a-zA-Z$_] // these are the "java letters" below 0xFF + | // covers all characters above 0xFF which are not a surrogate + ~[\u0000-\u00FF\uD800-\uDBFF] + {IsJavaIdentifierCharacter((char)_input.La(-1), true)}? + //| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + // [\uD800-\uDBFF] [\uDC00-\uDFFF] + // {Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.La(-2), (char)_input.La(-1)))}? + ; -WS : (' '|'\r'|'\t'|'\u000C'|'\n')+ -> channel(HIDDEN) +fragment +JavaLetterOrDigit + : [a-zA-Z0-9$_] // these are the "java letters or digits" below 0xFF + | // covers all characters above 0xFF which are not a surrogate + ~[\u0000-\u00FF\uD800-\uDBFF] + {IsJavaIdentifierCharacter((char)_input.La(-1), false)}? + //| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + // [\uD800-\uDBFF] [\uDC00-\uDFFF] + // {Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.La(-2), (char)_input.La(-1)))}? + ; + +// +// Additional symbols not defined in the lexical specification +// + +AT : '@'; +ELLIPSIS : '...'; + +// +// Whitespace and comments +// + +WS : [ \t\r\n\u000C]+ -> skip ; COMMENT - : '/*' .*? '*/' -> channel(HIDDEN) + : '/*' .*? '*/' -> skip ; LINE_COMMENT - : '//' ~('\n'|'\r')* '\r'? '\n' -> channel(HIDDEN) + : '//' ~[\r\n]* -> skip ; diff --git a/runtime/CSharp/Antlr4.Runtime.Test/Java.g4 b/runtime/CSharp/Antlr4.Runtime.Test/Java.g4 index 61ac0db38..26b0e9b2e 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/Java.g4 +++ b/runtime/CSharp/Antlr4.Runtime.Test/Java.g4 @@ -168,14 +168,48 @@ grammar Java; @lexer::members { - protected const int EOF = Eof; - protected const int HIDDEN = Hidden; - protected bool enumIsKeyword = true; - protected bool assertIsKeyword = true; + +private static bool IsJavaIdentifierCharacter(char c, bool start) +{ + switch (char.GetUnicodeCategory(c)) + { + case System.Globalization.UnicodeCategory.UppercaseLetter: + case System.Globalization.UnicodeCategory.LowercaseLetter: + case System.Globalization.UnicodeCategory.TitlecaseLetter: + case System.Globalization.UnicodeCategory.ModifierLetter: + case System.Globalization.UnicodeCategory.OtherLetter: + // isLetter returns true + return true; + + case System.Globalization.UnicodeCategory.LetterNumber: + // getType returns LETTER_NUMBER + return true; + + case System.Globalization.UnicodeCategory.CurrencySymbol: + // a currency symbol (such as "$") + return true; + + case System.Globalization.UnicodeCategory.ConnectorPunctuation: + // a connecting punctuation character (such as "_") + return true; + + case System.Globalization.UnicodeCategory.DecimalDigitNumber: + // it is a digit + return !start; + + case System.Globalization.UnicodeCategory.SpacingCombiningMark: + // it is a combining mark + return !start; + + case System.Globalization.UnicodeCategory.NonSpacingMark: + // it is a non-spacing mark + return !start; + + default: + return false; + } } -@parser::members { - protected const int EOF = Eof; } // starting point for parsing a java file @@ -213,14 +247,15 @@ classOrInterfaceModifiers ; classOrInterfaceModifier - : annotation // class or interface - | 'public' // class or interface - | 'protected' // class or interface - | 'private' // class or interface - | 'abstract' // class or interface - | 'static' // class or interface - | 'final' // class only -- does not apply to interfaces - | 'strictfp' // class or interface + : annotation // class or interface + | ( 'public' // class or interface + | 'protected' // class or interface + | 'private' // class or interface + | 'abstract' // class or interface + | 'static' // class or interface + | 'final' // class only -- does not apply to interfaces + | 'strictfp' // class or interface + ) ; modifiers @@ -417,17 +452,18 @@ arrayInitializer modifier : annotation - | 'public' - | 'protected' - | 'private' - | 'static' - | 'abstract' - | 'final' - | 'native' - | 'synchronized' - | 'transient' - | 'volatile' - | 'strictfp' + | ( 'public' + | 'protected' + | 'private' + | 'static' + | 'abstract' + | 'final' + | 'native' + | 'synchronized' + | 'transient' + | 'volatile' + | 'strictfp' + ) ; packageOrTypeName @@ -498,39 +534,22 @@ methodBody ; constructorBody - : '{' explicitConstructorInvocation? blockStatement* '}' + : block ; -explicitConstructorInvocation - : nonWildcardTypeArguments? ('this' | 'super') arguments ';' - | primary '.' nonWildcardTypeArguments? 'super' arguments ';' - ; - - qualifiedName : Identifier ('.' Identifier)* ; literal - : integerLiteral + : IntegerLiteral | FloatingPointLiteral | CharacterLiteral | StringLiteral - | booleanLiteral + | BooleanLiteral | 'null' ; -integerLiteral - : HexLiteral - | OctalLiteral - | DecimalLiteral - ; - -booleanLiteral - : 'true' - | 'false' - ; - // ANNOTATIONS annotations @@ -573,6 +592,7 @@ annotationTypeBody annotationTypeElementDeclaration : modifiers annotationTypeElementRest + | ';' // this is not allowed by the grammar, but apparently allowed by the actual compiler ; annotationTypeElementRest @@ -625,18 +645,14 @@ variableModifiers ; statement -//@leftfactor{catches} - : block + : block | ASSERT expression (':' expression)? ';' | 'if' parExpression statement ('else' statement)? | 'for' '(' forControl ')' statement | 'while' parExpression statement | 'do' statement 'while' parExpression ';' - | 'try' block - ( catches 'finally' block - | catches - | 'finally' block - ) + | 'try' block (catches finallyBlock? | finallyBlock) + | 'try' resourceSpecification block catches? finallyBlock? | 'switch' parExpression '{' switchBlockStatementGroups '}' | 'synchronized' parExpression block | 'return' expression? ';' @@ -647,15 +663,35 @@ statement | statementExpression ';' | Identifier ':' statement ; - + catches - : catchClause (catchClause)* + : catchClause+ ; catchClause - : 'catch' '(' formalParameter ')' block + : 'catch' '(' variableModifiers catchType Identifier ')' block ; +catchType + : qualifiedName ('|' qualifiedName)* + ; + +finallyBlock + : 'finally' block + ; + +resourceSpecification + : '(' resources ';'? ')' + ; + +resources + : resource (';' resource)* + ; + +resource + : variableModifiers classOrInterfaceType variableDeclaratorId '=' expression + ; + formalParameter : variableModifiers type variableDeclaratorId ; @@ -728,27 +764,13 @@ assignmentOperator | '|=' | '^=' | '%=' - | t1='<' t2='<' t3='=' -// { $t1.getLine() == $t2.getLine() && -// $t1.getCharPositionInLine() + 1 == $t2.getCharPositionInLine() && -// $t2.getLine() == $t3.getLine() && -// $t2.getCharPositionInLine() + 1 == $t3.getCharPositionInLine() }? - | t1='>' t2='>' t3='>' t4='=' -// { $t1.getLine() == $t2.getLine() && -// $t1.getCharPositionInLine() + 1 == $t2.getCharPositionInLine() && -// $t2.getLine() == $t3.getLine() && -// $t2.getCharPositionInLine() + 1 == $t3.getCharPositionInLine() && -// $t3.getLine() == $t4.getLine() && -// $t3.getCharPositionInLine() + 1 == $t4.getCharPositionInLine() }? - | t1='>' t2='>' t3='=' -// { $t1.getLine() == $t2.getLine() && -// $t1.getCharPositionInLine() + 1 == $t2.getCharPositionInLine() && -// $t2.getLine() == $t3.getLine() && -// $t2.getCharPositionInLine() + 1 == $t3.getCharPositionInLine() }? + | '<<=' + | '>>=' + | '>>>=' ; conditionalExpression - : conditionalOrExpression ( '?' conditionalExpression ':' conditionalExpression )? + : conditionalOrExpression ( '?' expression ':' conditionalExpression )? ; conditionalOrExpression @@ -784,14 +806,10 @@ relationalExpression ; relationalOp - : t1='<' t2='=' -// { $t1.getLine() == $t2.getLine() && -// $t1.getCharPositionInLine() + 1 == $t2.getCharPositionInLine() }? - | t1='>' t2='=' -// { $t1.getLine() == $t2.getLine() && -// $t1.getCharPositionInLine() + 1 == $t2.getCharPositionInLine() }? - | '<' - | '>' + : '<=' + | '>=' + | '<' + | '>' ; shiftExpression @@ -843,10 +861,11 @@ castExpression primary : parExpression - | 'this' ('.' Identifier)* identifierSuffix? + | 'this' arguments? | 'super' superSuffix | literal | 'new' creator + | nonWildcardTypeArguments (explicitGenericInvocationSuffix | 'this' arguments) | Identifier ('.' Identifier)* identifierSuffix? | primitiveType ('[' ']')* '.' 'class' | 'void' '.' 'class' @@ -854,13 +873,13 @@ primary identifierSuffix : ('[' ']')+ '.' 'class' - | ('[' expression ']')+ // can also be matched by selector, but do here + | '[' expression ']' | arguments | '.' 'class' | '.' explicitGenericInvocation | '.' 'this' | '.' 'super' arguments - | '.' 'new' innerCreator + | '.' 'new' nonWildcardTypeArguments? innerCreator ; creator @@ -869,12 +888,12 @@ creator ; createdName - : classOrInterfaceType - | primitiveType + : Identifier typeArgumentsOrDiamond? ('.' Identifier typeArgumentsOrDiamond?)* + | primitiveType ; innerCreator - : nonWildcardTypeArguments? Identifier classCreatorRest + : Identifier nonWildcardTypeArgumentsOrDiamond? classCreatorRest ; arrayCreatorRest @@ -889,18 +908,29 @@ classCreatorRest ; explicitGenericInvocation - : nonWildcardTypeArguments Identifier arguments + : nonWildcardTypeArguments explicitGenericInvocationSuffix ; nonWildcardTypeArguments : '<' typeList '>' ; - + +typeArgumentsOrDiamond + : '<' '>' + | typeArguments + ; + +nonWildcardTypeArgumentsOrDiamond + : '<' '>' + | nonWildcardTypeArguments + ; + selector : '.' Identifier arguments? + | '.' explicitGenericInvocation | '.' 'this' | '.' 'super' superSuffix - | '.' 'new' innerCreator + | '.' 'new' nonWildcardTypeArguments? innerCreator | '[' expression ']' ; @@ -909,128 +939,439 @@ superSuffix | '.' Identifier arguments? ; +explicitGenericInvocationSuffix + : 'super' superSuffix + | Identifier arguments + ; + arguments : '(' expressionList? ')' ; // LEXER -HexLiteral : '0' ('x'|'X') HexDigit+ IntegerTypeSuffix? ; +// §3.9 Keywords -DecimalLiteral : ('0' | '1'..'9' '0'..'9'*) IntegerTypeSuffix? ; +ABSTRACT : 'abstract'; +ASSERT : 'assert'; +BOOLEAN : 'boolean'; +BREAK : 'break'; +BYTE : 'byte'; +CASE : 'case'; +CATCH : 'catch'; +CHAR : 'char'; +CLASS : 'class'; +CONST : 'const'; +CONTINUE : 'continue'; +DEFAULT : 'default'; +DO : 'do'; +DOUBLE : 'double'; +ELSE : 'else'; +ENUM : 'enum'; +EXTENDS : 'extends'; +FINAL : 'final'; +FINALLY : 'finally'; +FLOAT : 'float'; +FOR : 'for'; +IF : 'if'; +GOTO : 'goto'; +IMPLEMENTS : 'implements'; +IMPORT : 'import'; +INSTANCEOF : 'instanceof'; +INT : 'int'; +INTERFACE : 'interface'; +LONG : 'long'; +NATIVE : 'native'; +NEW : 'new'; +PACKAGE : 'package'; +PRIVATE : 'private'; +PROTECTED : 'protected'; +PUBLIC : 'public'; +RETURN : 'return'; +SHORT : 'short'; +STATIC : 'static'; +STRICTFP : 'strictfp'; +SUPER : 'super'; +SWITCH : 'switch'; +SYNCHRONIZED : 'synchronized'; +THIS : 'this'; +THROW : 'throw'; +THROWS : 'throws'; +TRANSIENT : 'transient'; +TRY : 'try'; +VOID : 'void'; +VOLATILE : 'volatile'; +WHILE : 'while'; -OctalLiteral : '0' ('0'..'7')+ IntegerTypeSuffix? ; +// §3.10.1 Integer Literals + +IntegerLiteral + : DecimalIntegerLiteral + | HexIntegerLiteral + | OctalIntegerLiteral + | BinaryIntegerLiteral + ; fragment -HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ; +DecimalIntegerLiteral + : DecimalNumeral IntegerTypeSuffix? + ; fragment -IntegerTypeSuffix : ('l'|'L') ; +HexIntegerLiteral + : HexNumeral IntegerTypeSuffix? + ; + +fragment +OctalIntegerLiteral + : OctalNumeral IntegerTypeSuffix? + ; + +fragment +BinaryIntegerLiteral + : BinaryNumeral IntegerTypeSuffix? + ; + +fragment +IntegerTypeSuffix + : [lL] + ; + +fragment +DecimalNumeral + : '0' + | NonZeroDigit (Digits? | Underscores Digits) + ; + +fragment +Digits + : Digit (DigitsAndUnderscores? Digit)? + ; + +fragment +Digit + : '0' + | NonZeroDigit + ; + +fragment +NonZeroDigit + : [1-9] + ; + +fragment +DigitsAndUnderscores + : DigitOrUnderscore+ + ; + +fragment +DigitOrUnderscore + : Digit + | '_' + ; + +fragment +Underscores + : '_'+ + ; + +fragment +HexNumeral + : '0' [xX] HexDigits + ; + +fragment +HexDigits + : HexDigit (HexDigitsAndUnderscores? HexDigit)? + ; + +fragment +HexDigit + : [0-9a-fA-F] + ; + +fragment +HexDigitsAndUnderscores + : HexDigitOrUnderscore+ + ; + +fragment +HexDigitOrUnderscore + : HexDigit + | '_' + ; + +fragment +OctalNumeral + : '0' Underscores? OctalDigits + ; + +fragment +OctalDigits + : OctalDigit (OctalDigitsAndUnderscores? OctalDigit)? + ; + +fragment +OctalDigit + : [0-7] + ; + +fragment +OctalDigitsAndUnderscores + : OctalDigitOrUnderscore+ + ; + +fragment +OctalDigitOrUnderscore + : OctalDigit + | '_' + ; + +fragment +BinaryNumeral + : '0' [bB] BinaryDigits + ; + +fragment +BinaryDigits + : BinaryDigit (BinaryDigitsAndUnderscores? BinaryDigit)? + ; + +fragment +BinaryDigit + : [01] + ; + +fragment +BinaryDigitsAndUnderscores + : BinaryDigitOrUnderscore+ + ; + +fragment +BinaryDigitOrUnderscore + : BinaryDigit + | '_' + ; + +// §3.10.2 Floating-Point Literals FloatingPointLiteral - : ('0'..'9')+ '.' ('0'..'9')* Exponent? FloatTypeSuffix? - | '.' ('0'..'9')+ Exponent? FloatTypeSuffix? - | ('0'..'9')+ Exponent FloatTypeSuffix? - | ('0'..'9')+ FloatTypeSuffix - | '0' ('x'|'X') - ( HexDigit+ ('.' HexDigit*)? HexExponent FloatTypeSuffix? - | '.' HexDigit+ HexExponent FloatTypeSuffix? - ) - ; + : DecimalFloatingPointLiteral + | HexadecimalFloatingPointLiteral + ; fragment -Exponent : ('e'|'E') ('+'|'-')? ('0'..'9')+ ; +DecimalFloatingPointLiteral + : Digits '.' Digits? ExponentPart? FloatTypeSuffix? + | '.' Digits ExponentPart? FloatTypeSuffix? + | Digits ExponentPart FloatTypeSuffix? + | Digits FloatTypeSuffix + ; fragment -HexExponent : ('p'|'P') ('+'|'-')? ('0'..'9')+ ; +ExponentPart + : ExponentIndicator SignedInteger + ; fragment -FloatTypeSuffix : ('f'|'F'|'d'|'D') ; +ExponentIndicator + : [eE] + ; + +fragment +SignedInteger + : Sign? Digits + ; + +fragment +Sign + : [+-] + ; + +fragment +FloatTypeSuffix + : [fFdD] + ; + +fragment +HexadecimalFloatingPointLiteral + : HexSignificand BinaryExponent FloatTypeSuffix? + ; + +fragment +HexSignificand + : HexNumeral '.'? + | '0' [xX] HexDigits? '.' HexDigits + ; + +fragment +BinaryExponent + : BinaryExponentIndicator SignedInteger + ; + +fragment +BinaryExponentIndicator + : [pP] + ; + +// §3.10.3 Boolean Literals + +BooleanLiteral + : 'true' + | 'false' + ; + +// §3.10.4 Character Literals CharacterLiteral - : '\'' ( EscapeSequence | ~('\''|'\\') ) '\'' - ; + : '\'' SingleCharacter '\'' + | '\'' EscapeSequence '\'' + ; + +fragment +SingleCharacter + : ~['\\] + ; + +// §3.10.5 String Literals StringLiteral - : '"' ( EscapeSequence | ~('\\'|'"') )* '"' - ; + : '"' StringCharacters? '"' + ; + +fragment +StringCharacters + : StringCharacter+ + ; + +fragment +StringCharacter + : ~["\\] + | EscapeSequence + ; + +// §3.10.6 Escape Sequences for Character and String Literals fragment EscapeSequence - : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\') - | UnicodeEscape - | OctalEscape - ; + : '\\' [btnfr"'\\] + | OctalEscape + ; fragment OctalEscape - : '\\' ('0'..'3') ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') - ; + : '\\' OctalDigit + | '\\' OctalDigit OctalDigit + | '\\' ZeroToThree OctalDigit OctalDigit + ; fragment -UnicodeEscape - : '\\' 'u' HexDigit HexDigit HexDigit HexDigit - ; +ZeroToThree + : [0-3] + ; -ENUM: 'enum' {enumIsKeyword}? - ; - -ASSERT - : 'assert' {assertIsKeyword}? - ; - -Identifier - : Letter (Letter|JavaIDDigit)* - ; +// §3.10.7 The Null Literal -/**I found this char range in JavaCC's grammar, but Letter and Digit overlap. - Still works, but... - */ -fragment -Letter - : '\u0024' | - '\u0041'..'\u005a' | - '\u005f' | - '\u0061'..'\u007a' | - '\u00c0'..'\u00d6' | - '\u00d8'..'\u00f6' | - '\u00f8'..'\u00ff' | - '\u0100'..'\u1fff' | - '\u3040'..'\u318f' | - '\u3300'..'\u337f' | - '\u3400'..'\u3d2d' | - '\u4e00'..'\u9fff' | - '\uf900'..'\ufaff' - ; +NullLiteral + : 'null' + ; + +// §3.11 Separators + +LPAREN : '('; +RPAREN : ')'; +LBRACE : '{'; +RBRACE : '}'; +LBRACK : '['; +RBRACK : ']'; +SEMI : ';'; +COMMA : ','; +DOT : '.'; + +// §3.12 Operators + +ASSIGN : '='; +GT : '>'; +LT : '<'; +BANG : '!'; +TILDE : '~'; +QUESTION : '?'; +COLON : ':'; +EQUAL : '=='; +LE : '<='; +GE : '>='; +NOTEQUAL : '!='; +AND : '&&'; +OR : '||'; +INC : '++'; +DEC : '--'; +ADD : '+'; +SUB : '-'; +MUL : '*'; +DIV : '/'; +BITAND : '&'; +BITOR : '|'; +CARET : '^'; +MOD : '%'; + +ADD_ASSIGN : '+='; +SUB_ASSIGN : '-='; +MUL_ASSIGN : '*='; +DIV_ASSIGN : '/='; +AND_ASSIGN : '&='; +OR_ASSIGN : '|='; +XOR_ASSIGN : '^='; +MOD_ASSIGN : '%='; +LSHIFT_ASSIGN : '<<='; +RSHIFT_ASSIGN : '>>='; +URSHIFT_ASSIGN : '>>>='; + +// §3.8 Identifiers (must appear after all keywords in the grammar) + +Identifier + : JavaLetter JavaLetterOrDigit* + ; fragment -JavaIDDigit - : '\u0030'..'\u0039' | - '\u0660'..'\u0669' | - '\u06f0'..'\u06f9' | - '\u0966'..'\u096f' | - '\u09e6'..'\u09ef' | - '\u0a66'..'\u0a6f' | - '\u0ae6'..'\u0aef' | - '\u0b66'..'\u0b6f' | - '\u0be7'..'\u0bef' | - '\u0c66'..'\u0c6f' | - '\u0ce6'..'\u0cef' | - '\u0d66'..'\u0d6f' | - '\u0e50'..'\u0e59' | - '\u0ed0'..'\u0ed9' | - '\u1040'..'\u1049' - ; +JavaLetter + : [a-zA-Z$_] // these are the "java letters" below 0xFF + | // covers all characters above 0xFF which are not a surrogate + ~[\u0000-\u00FF\uD800-\uDBFF] + {IsJavaIdentifierCharacter((char)_input.La(-1), true)}? + //| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + // [\uD800-\uDBFF] [\uDC00-\uDFFF] + // {Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.La(-2), (char)_input.La(-1)))}? + ; -WS : (' '|'\r'|'\t'|'\u000C'|'\n')+ -> channel(HIDDEN) +fragment +JavaLetterOrDigit + : [a-zA-Z0-9$_] // these are the "java letters or digits" below 0xFF + | // covers all characters above 0xFF which are not a surrogate + ~[\u0000-\u00FF\uD800-\uDBFF] + {IsJavaIdentifierCharacter((char)_input.La(-1), false)}? + //| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF + // [\uD800-\uDBFF] [\uDC00-\uDFFF] + // {Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.La(-2), (char)_input.La(-1)))}? + ; + +// +// Additional symbols not defined in the lexical specification +// + +AT : '@'; +ELLIPSIS : '...'; + +// +// Whitespace and comments +// + +WS : [ \t\r\n\u000C]+ -> skip ; COMMENT - : '/*' .*? '*/' -> channel(HIDDEN) + : '/*' .*? '*/' -> skip ; LINE_COMMENT - : '//' ~('\n'|'\r')* '\r'? '\n' -> channel(HIDDEN) + : '//' ~[\r\n]* -> skip ; diff --git a/runtime/CSharp/Antlr4.Runtime.Test/JavaUnicodeInputStream.cs b/runtime/CSharp/Antlr4.Runtime.Test/JavaUnicodeInputStream.cs new file mode 100644 index 000000000..18615e6e8 --- /dev/null +++ b/runtime/CSharp/Antlr4.Runtime.Test/JavaUnicodeInputStream.cs @@ -0,0 +1,272 @@ +namespace Antlr4.Runtime.Test +{ + using System; + using System.Collections.Generic; + using System.Diagnostics; + using Antlr4.Runtime.Misc; + + public class JavaUnicodeInputStream : ICharStream + { + [NotNull] + private readonly ICharStream _source; + private readonly List _escapeIndexes = new List(); + private readonly List _escapeCharacters = new List(); + private readonly List _escapeIndirectionLevels = new List(); + + private int _escapeListIndex; + private int _range; + private int _slashCount; + + private int _la1; + + public JavaUnicodeInputStream([NotNull] ICharStream source) + { + if (source == null) + throw new ArgumentNullException("source"); + + this._source = source; + this._la1 = source.La(1); + } + + public int Size + { + get + { + return _source.Size; + } + } + + public int Index + { + get + { + return _source.Index; + } + } + + public string SourceName + { + get + { + return _source.SourceName; + } + } + + public String GetText(Interval interval) + { + return _source.GetText(interval); + } + + public void Consume() + { + if (_la1 != '\\') + { + _source.Consume(); + _la1 = _source.La(1); + _range = Math.Max(_range, _source.Index); + _slashCount = 0; + return; + } + + // make sure the next character has been processed + this.La(1); + + if (_escapeListIndex >= _escapeIndexes.Count || _escapeIndexes[_escapeListIndex] != Index) + { + _source.Consume(); + _slashCount++; + } + else + { + int indirectionLevel = _escapeIndirectionLevels[_escapeListIndex]; + for (int i = 0; i < 6 + indirectionLevel; i++) + { + _source.Consume(); + } + + _escapeListIndex++; + _slashCount = 0; + } + + _la1 = _source.La(1); + Debug.Assert(_range >= Index); + } + + public int La(int i) + { + if (i == 1 && _la1 != '\\') + { + return _la1; + } + + if (i <= 0) + { + int desiredIndex = Index + i; + for (int j = _escapeListIndex - 1; j >= 0; j--) + { + if (_escapeIndexes[j] + 6 + _escapeIndirectionLevels[j] > desiredIndex) + { + desiredIndex -= 5 + _escapeIndirectionLevels[j]; + } + + if (_escapeIndexes[j] == desiredIndex) + { + return _escapeCharacters[j]; + } + } + + return _source.La(desiredIndex - Index); + } + else + { + int desiredIndex = Index + i - 1; + for (int j = _escapeListIndex; j < _escapeIndexes.Count; j++) + { + if (_escapeIndexes[j] == desiredIndex) + { + return _escapeCharacters[j]; + } + else if (_escapeIndexes[j] < desiredIndex) + { + desiredIndex += 5 + _escapeIndirectionLevels[j]; + } + else + { + return _source.La(desiredIndex - Index + 1); + } + } + + int currentIndex = Index; + int slashCount = _slashCount; + int indirectionLevel = 0; + for (int j = 0; j < i; j++) + { + int previousIndex = currentIndex; + int c = ReadCharAt(ref currentIndex, ref slashCount, ref indirectionLevel); + if (currentIndex > _range) + { + if (currentIndex - previousIndex > 1) + { + _escapeIndexes.Add(previousIndex); + _escapeCharacters.Add(c); + _escapeIndirectionLevels.Add(indirectionLevel); + } + + _range = currentIndex; + } + + if (j == i - 1) + { + return c; + } + } + + throw new InvalidOperationException("shouldn't be reachable"); + } + } + + public int Mark() + { + return _source.Mark(); + } + + public void Release(int marker) + { + _source.Release(marker); + } + + public void Seek(int index) + { + if (index > _range) + { + throw new NotSupportedException(); + } + + _source.Seek(index); + _la1 = _source.La(1); + + _slashCount = 0; + while (_source.La(-_slashCount - 1) == '\\') + { + _slashCount++; + } + + _escapeListIndex = _escapeIndexes.BinarySearch(_source.Index); + if (_escapeListIndex < 0) + { + _escapeListIndex = -_escapeListIndex - 1; + } + } + + private static bool IsHexDigit(int c) + { + return c >= '0' && c <= '9' + || c >= 'a' && c <= 'f' + || c >= 'A' && c <= 'F'; + } + + private static int HexValue(int c) + { + if (c >= '0' && c <= '9') + { + return c - '0'; + } + + if (c >= 'a' && c <= 'f') + { + return c - 'a' + 10; + } + + if (c >= 'A' && c <= 'F') + { + return c - 'A' + 10; + } + + throw new ArgumentException("c"); + } + + private int ReadCharAt(ref int nextIndex, ref int slashCount, ref int indirectionLevel) + { + bool blockUnicodeEscape = (slashCount % 2) != 0; + + int c0 = _source.La(nextIndex - Index + 1); + if (c0 == '\\') + { + slashCount++; + + if (!blockUnicodeEscape) + { + int c1 = _source.La(nextIndex - Index + 2); + if (c1 == 'u') + { + int c2 = _source.La(nextIndex - Index + 3); + indirectionLevel = 0; + while (c2 == 'u') + { + indirectionLevel++; + c2 = _source.La(nextIndex - Index + 3 + indirectionLevel); + } + + int c3 = _source.La(nextIndex - Index + 4 + indirectionLevel); + int c4 = _source.La(nextIndex - Index + 5 + indirectionLevel); + int c5 = _source.La(nextIndex - Index + 6 + indirectionLevel); + if (IsHexDigit(c2) && IsHexDigit(c3) && IsHexDigit(c4) && IsHexDigit(c5)) + { + int value = HexValue(c2); + value = (value << 4) + HexValue(c3); + value = (value << 4) + HexValue(c4); + value = (value << 4) + HexValue(c5); + + nextIndex += 6 + indirectionLevel; + slashCount = 0; + return value; + } + } + } + } + + nextIndex++; + return c0; + } + } +} diff --git a/runtime/CSharp/Antlr4.Runtime.Test/TestPerformance.cs b/runtime/CSharp/Antlr4.Runtime.Test/TestPerformance.cs index 217f6c9cc..cf19a81fd 100644 --- a/runtime/CSharp/Antlr4.Runtime.Test/TestPerformance.cs +++ b/runtime/CSharp/Antlr4.Runtime.Test/TestPerformance.cs @@ -18,7 +18,9 @@ using File = System.IO.File; using FileInfo = System.IO.FileInfo; using Interlocked = System.Threading.Interlocked; + using IOException = System.IO.IOException; using Path = System.IO.Path; + using SearchOption = System.IO.SearchOption; using Stopwatch = System.Diagnostics.Stopwatch; using Stream = System.IO.Stream; using StreamReader = System.IO.StreamReader; @@ -44,6 +46,18 @@ * {@link #TOP_PACKAGE}. */ private static readonly bool RECURSIVE = true; + /** + * {@code true} to read all source files from disk into memory before + * starting the parse. The default value is {@code true} to help prevent + * drive speed from affecting the performance results. This value may be set + * to {@code false} to support parsing large input sets which would not + * otherwise fit into memory. + */ + private static readonly bool PRELOAD_SOURCES = true; + /** + * The encoding to use when reading source files. + */ + private static readonly Encoding ENCODING = Encoding.UTF8; /** * {@code true} to use the Java grammar with expressions in the v4 @@ -95,7 +109,7 @@ * {@code true} to use {@link BailErrorStrategy}, {@code false} to use * {@link DefaultErrorStrategy}. */ - private static readonly bool BAIL_ON_ERROR = true; + private static readonly bool BAIL_ON_ERROR = false; /** * {@code true} to compute a checksum for verifying consistency across * optimizations and multiple passes. @@ -230,7 +244,7 @@ DirectoryInfo directory = new DirectoryInfo(jdkSourceRoot); Assert.IsTrue(directory.Exists); - IEnumerable sources = loadSources(directory, "*.java", RECURSIVE); + IEnumerable sources = LoadSources(directory, "*.java", RECURSIVE); Console.Out.Write(getOptionsDescription(TOP_PACKAGE)); @@ -327,7 +341,7 @@ * This method is separate from {@link #parse2} so the first pass can be distinguished when analyzing * profiler results. */ - protected void parse1(ParserFactory factory, IEnumerable sources) + protected void parse1(ParserFactory factory, IEnumerable sources) { GC.Collect(); parseSources(factory, sources); @@ -337,49 +351,33 @@ * This method is separate from {@link #parse1} so the first pass can be distinguished when analyzing * profiler results. */ - protected void parse2(ParserFactory factory, IEnumerable sources) + protected void parse2(ParserFactory factory, IEnumerable sources) { GC.Collect(); parseSources(factory, sources); } - protected IEnumerable loadSources(DirectoryInfo directory, string filter, bool recursive) + protected IList LoadSources(DirectoryInfo directory, string filter, bool recursive) { - return loadSources(directory, filter, null, recursive); - } - - protected IEnumerable loadSources(DirectoryInfo directory, string filter, Encoding encoding, bool recursive) - { - ICollection result = new List(); - loadSources(directory, filter, encoding, recursive, result); + IList result = new List(); + LoadSources(directory, filter, recursive, result); return result; } - protected void loadSources(DirectoryInfo directory, string filter, Encoding encoding, bool recursive, ICollection result) + protected void LoadSources(DirectoryInfo directory, string filter, bool recursive, ICollection result) { Debug.Assert(directory.Exists); - FileInfo[] sources = directory.GetFiles(filter); + FileInfo[] sources = directory.GetFiles(filter, recursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly); foreach (FileInfo file in sources) { - var stream = new StreamReader(File.OpenRead(file.FullName), encoding); - ICharStream input = new AntlrInputStream(stream); - result.Add(input); - } - - if (recursive) - { - DirectoryInfo[] children = directory.GetDirectories(); - foreach (DirectoryInfo child in children) - { - loadSources(child, filter, encoding, true, result); - } + result.Add(new InputDescriptor(file.FullName)); } } int configOutputSize = 0; - protected void parseSources(ParserFactory factory, IEnumerable sources) + protected void parseSources(ParserFactory factory, IEnumerable sources) { Stopwatch startTime = Stopwatch.StartNew(); Thread.VolatileWrite(ref tokenCount, 0); @@ -397,8 +395,9 @@ #else ICollection> results = new List>(); #endif - foreach (ICharStream input in sources) + foreach (InputDescriptor inputDescriptor in sources) { + ICharStream input = inputDescriptor.GetInputStream(); sourceCount++; input.Seek(0); inputSize += input.Size; @@ -1094,5 +1093,107 @@ updateChecksum(checksum, ctx.Stop); } } + + protected sealed class InputDescriptor + { + private readonly string source; + private WeakReference inputStream; + private CloneableAntlrFileStream strongInputStream; + + public InputDescriptor([NotNull] String source) + { + this.source = source; + if (PRELOAD_SOURCES) + { + GetInputStream(); + } + } + + [return: NotNull] + public ICharStream GetInputStream() + { + CloneableAntlrFileStream stream; + if (!TryGetTarget(out stream)) + { + stream = new CloneableAntlrFileStream(source, ENCODING); + SetTarget(stream); + } + + return new JavaUnicodeInputStream(stream.CreateCopy()); + } + + private void SetTarget(CloneableAntlrFileStream stream) + { + if (PRELOAD_SOURCES) + { + strongInputStream = stream; + } + else + { + inputStream = new WeakReference(stream); + } + } + + private bool TryGetTarget(out CloneableAntlrFileStream stream) + { + if (PRELOAD_SOURCES) + { + stream = strongInputStream; + return strongInputStream != null; + } + else + { + if (inputStream == null) + { + stream = null; + return false; + } + + return inputStream.TryGetTarget(out stream); + } + } + } + +#if PORTABLE + protected class CloneableAntlrFileStream : AntlrInputStream +#else + protected class CloneableAntlrFileStream : AntlrFileStream +#endif + { + public CloneableAntlrFileStream(String fileName, Encoding encoding) +#if PORTABLE + : base(File.ReadAllText(fileName, encoding)) +#else + : base(fileName, encoding) +#endif + { + } + + public AntlrInputStream CreateCopy() + { + AntlrInputStream stream = new AntlrInputStream(this.data, this.n); + stream.name = this.SourceName; + return stream; + } + } + +#if !NET45 + private sealed class WeakReference + where T : class + { + private readonly WeakReference _reference; + + public WeakReference(T reference) + { + _reference = new WeakReference(reference); + } + + public bool TryGetTarget(out T reference) + { + reference = (T)_reference.Target; + return reference != null; + } + } +#endif } }