|
| 1 | +// ******* GRUN (Grammar Unit Test) for Python ******* |
| 2 | + |
| 3 | +using System; |
| 4 | +using System.Text.RegularExpressions; |
| 5 | +using System.Text; |
| 6 | +using Antlr4.Runtime; |
| 7 | + |
namespace grun4py
{
    /// <summary>
    /// Command-line driver that tokenizes and parses a single Python source file with the
    /// generated <c>PythonLexer</c> / <c>PythonParser</c>, printing every token to standard output.
    /// </summary>
    internal static class Program
    {
        /// <summary>Encoding used when the source file declares none (or declares an unknown one).</summary>
        private const string DefaultPythonEncoding = "utf-8";

        /// <summary>
        /// Upper bound for the process exit code. POSIX systems keep only the low 8 bits of an
        /// exit status, so an unclamped error count of 256 would be reported as success (0).
        /// </summary>
        private const int MaxExitCode = 255;

        /// <summary>Matches a line that is empty, whitespace-only, or a comment (WS? COMMENT?).</summary>
        private static readonly Regex BlankOrCommentLine =
            new(@"^[ \t\f]*(#.*)?$", RegexOptions.Compiled);

        /// <summary>Matches an encoding declaration comment; group 1 captures the encoding name.</summary>
        private static readonly Regex EncodingDeclaration =
            new(@"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)", RegexOptions.Compiled);

        /// <summary>
        /// Entry point: lexes the file given as the first argument, prints all tokens, then parses it.
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the input file path.</param>
        /// <returns>
        /// The number of syntax errors reported by the parser, clamped to 255;
        /// 1 when no path was given or an exception occurred.
        /// </returns>
        public static int Main(string[] args)
        {
            if (args.Length < 1)
            {
                Console.Error.WriteLine("Error: Please provide an input file path");
                return 1;
            }

            try
            {
                string filePath = args[0];
                var input = GetEncodedInputStreamByPythonComment(filePath);
                var lexer = new PythonLexer(input);
                var tokens = new CommonTokenStream((ITokenSource)lexer);
                var parser = new PythonParser(tokens);

                // Test the lexer grammar: force the whole file to be tokenized, then dump it.
                tokens.Fill();
                foreach (IToken token in tokens.GetTokens())
                {
                    Console.WriteLine(GetTokenMetaDataWithTokenName(token));
                }

                // Test the parser grammar.
                parser.file_input();

                // Clamp so that a large error count can never wrap around to exit code 0.
                return Math.Min(parser.NumberOfSyntaxErrors, MaxExitCode);
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine($"Error: {ex.Message}");
                return 1; // Error occurred, returning non-zero exit code
            }
        }

        /// <summary>
        /// Formats a token as
        /// <c>[@TokenIndex,StartIndex:StopIndex='Text',&lt;TokenName&gt;,channel=ChannelName,Line:Column]</c>.
        /// The <c>channel=...,</c> part is omitted for tokens on the default channel.
        /// </summary>
        /// <param name="token">The token to describe.</param>
        /// <returns>The single-line description of <paramref name="token"/>.</returns>
        private static string GetTokenMetaDataWithTokenName(IToken token)
        {
            // Guard against a token without text so the escaping helper never sees null.
            string tokenText = ReplaceSpecialCharacters(token.Text ?? string.Empty);

            string tokenName = token.Type == TokenConstants.EOF
                ? "EOF"
                : PythonLexer.DefaultVocabulary.GetDisplayName(token.Type);

            string channelText = token.Channel == TokenConstants.DefaultChannel
                ? ""
                : $"channel={PythonLexer.channelNames[token.Channel]},";

            return $"[@{token.TokenIndex},{token.StartIndex}:{token.StopIndex}='{tokenText}',<{tokenName}>,{channelText}{token.Line}:{token.Column}]";
        }

        /// <summary>
        /// Replaces line feed, carriage return, tab and form feed characters with their
        /// two-character escape spellings so that a token always prints on one line.
        /// </summary>
        /// <param name="text">The raw token text.</param>
        /// <returns><paramref name="text"/> with the four control characters escaped.</returns>
        private static string ReplaceSpecialCharacters(string text)
        {
            return text.Replace("\n", @"\n")
                       .Replace("\r", @"\r")
                       .Replace("\t", @"\t")
                       .Replace("\f", @"\f");
        }

        /// <summary>
        /// Opens <paramref name="filePath"/> as an ANTLR character stream, decoded with the encoding
        /// declared in a comment on one of the first two lines of the file, or UTF-8 when no usable
        /// declaration is found.
        /// </summary>
        /// <param name="filePath">Path of the Python source file.</param>
        /// <returns>
        /// The character stream for the file. The return type is declared nullable, but every
        /// path through this method either returns a stream or throws.
        /// </returns>
        public static ICharStream? GetEncodedInputStreamByPythonComment(string filePath)
        {
            string declaredName = DetectDeclaredEncodingName(filePath);
            Encoding encoding = ResolveEncoding(declaredName);
            return CharStreams.fromPath(filePath, encoding);
        }

        /// <summary>
        /// Scans at most the first two lines of the file for an encoding declaration.
        /// Scanning stops at end of file, at the first declaration found, or at the first line
        /// that is neither blank nor a comment.
        /// </summary>
        /// <param name="filePath">Path of the Python source file.</param>
        /// <returns>The declared (normalized) encoding name, or an empty string when none was found.</returns>
        private static string DetectDeclaredEncodingName(string filePath)
        {
            try
            {
                using FileStream fs = new(filePath, FileMode.Open, FileAccess.Read); // read in binary mode
                using StreamReader reader = new(fs, Encoding.ASCII);

                for (int lineIndex = 0; lineIndex < 2; lineIndex++)
                {
                    string? line = reader.ReadLine();
                    if (line is null || !BlankOrCommentLine.IsMatch(line))
                    {
                        // EOF reached, or a statement/backslash found (not empty, not whitespace, not comment).
                        return "";
                    }

                    string encodingName = GetEncodingName(line);
                    if (encodingName.Length > 0)
                    {
                        return encodingName;
                    }
                }
            }
            catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
            {
                // Deliberate best effort: sniffing the declaration must not hide the real problem.
                // The caller opens the same path right afterwards, which is where an unreadable
                // file gets reported.
            }

            return "";
        }

        /// <summary>
        /// Maps an encoding name to an <see cref="Encoding"/>, falling back to UTF-8 when the
        /// name is empty or not recognized by the runtime.
        /// </summary>
        /// <param name="encodingName">Encoding name taken from the source file; may be empty.</param>
        /// <returns>The matching encoding, or UTF-8.</returns>
        private static Encoding ResolveEncoding(string encodingName)
        {
            if (encodingName.Length > 0)
            {
                try
                {
                    return Encoding.GetEncoding(encodingName);
                }
                catch (ArgumentException)
                {
                    // Unknown or unsupported encoding name: fall through to the default.
                }
            }

            return Encoding.GetEncoding(DefaultPythonEncoding);
        }

        /// <summary>
        /// Extracts the encoding name from an encoding declaration comment
        /// (https://peps.python.org/pep-0263/#defining-the-encoding).
        /// </summary>
        /// <param name="commentText">One line of source text.</param>
        /// <returns>
        /// The declared encoding name, with the aliases <c>cp1252</c>, <c>latin-1</c> and
        /// <c>iso-8859-1</c> (compared case-insensitively) normalized to <c>latin1</c>;
        /// an empty string when the line contains no declaration.
        /// </returns>
        public static string GetEncodingName(string commentText)
        {
            Match match = EncodingDeclaration.Match(commentText);
            if (!match.Success)
            {
                return "";
            }

            string declared = match.Groups[1].Value;

            // Invariant casing: culture-sensitive lower-casing (e.g. Turkish dotless i) would
            // make "ISO-8859-1" miss the alias list.
            // NOTE(review): cp1252 is folded into latin1 although the two code pages differ in
            // 0x80-0x9F; presumably a workaround for runtimes without that code page - confirm.
            return declared.ToLowerInvariant() switch
            {
                "cp1252" or "latin-1" or "iso-8859-1" => "latin1",
                // more encoding pairs
                _ => declared,
            };
        }
    }
}
0 commit comments