feat: add C++ port of TypeScript PythonLexerBase for Python 3.13
Ported the missing PythonLexerBase from TypeScript to C++. This is needed so that one can create a fully working Python parser.
Thanks for this, but the code doesn't compile/link. After correcting the compilation errors, it crashes.
The first thing you'll need to do is to fit this into the build because, as is, nothing is compiled/linked or tested for the Cpp target. You need these changes:

- The desc.xml must have `Cpp` added to the `<targets>`. Testing won't build anything if you don't declare that the Cpp target actually "works".
- The grammar file needs `@header` to add `#include "PythonLexerBase.h"` in the generated file PythonLexer.h. But an `@header` block always has to be written in a target-specific language, and we can't allow that because PythonLexer.g4 is supposed to be "target agnostic". To work around this, you have to have a string in the .g4 that gets modified just prior to calling the Antlr4 Tool to generate the .cpp and .h files. That is the whole purpose of "transformGrammar.py".
- I couldn't get anything to compile with `using` statements. It all compiles if I use fully qualified namespaces for Antlr4 types. So, instead of `Token`, it should be `antlr4::Token`; see the sketch after this list.
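To make that last point concrete, here is a minimal sketch of the pattern (the class name `ExampleLexerBase` is hypothetical; the real class is PythonLexerBase, and only the ANTLR4 C++ runtime header is assumed):

```cpp
// Minimal sketch of a lexer base class that uses only fully qualified
// antlr4:: types and never includes the generated PythonLexer header.
// "ExampleLexerBase" is a hypothetical name used for illustration.
#include "antlr4-runtime.h"

class ExampleLexerBase : public antlr4::Lexer {
public:
    explicit ExampleLexerBase(antlr4::CharStream *input)
        : antlr4::Lexer(input) {}

    std::unique_ptr<antlr4::Token> nextToken() override {
        // Qualify the call so it dispatches to the runtime's Lexer
        // implementation; an unqualified virtual call would recurse
        // back into this override via the generated subclass.
        return this->Lexer::nextToken();
    }
};
// Note: the class stays abstract (the runtime's pure virtuals are
// implemented by the generated lexer that derives from it).
```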
Applying this, these are the changes you need to make to your code.
$ git diff .
```diff
diff --git a/python/python3_13/Cpp/PythonLexerBase.cpp b/python/python3_13/Cpp/PythonLexerBase.cpp
index e35d8ee5..116d366c 100644
--- a/python/python3_13/Cpp/PythonLexerBase.cpp
+++ b/python/python3_13/Cpp/PythonLexerBase.cpp
@@ -1,4 +1,7 @@
+#include "antlr4-runtime.h"
#include "PythonLexerBase.h"
+#include "PythonLexer.h"
+#include "PythonParser.h"
using namespace antlr4;
@@ -148,7 +151,7 @@ void PythonLexerBase::setCurrentAndFollowingTokens() {
if (this->ffgToken) {
this->curToken = this->cloneToken(this->ffgToken);
} else {
- this->curToken = PythonLexer::nextToken();
+ this->curToken = this->Lexer::nextToken();
}
this->checkCurToken(); // ffgToken cannot be used in this method and its sub methods (ffgToken is not yet set)!
@@ -156,7 +159,7 @@ void PythonLexerBase::setCurrentAndFollowingTokens() {
if (this->curToken->getType() == PythonLexer::EOF) {
this->ffgToken = this->cloneToken(this->ffgToken);
} else {
- this->ffgToken = PythonLexer::nextToken();
+ this->ffgToken = this->Lexer::nextToken();
}
}
diff --git a/python/python3_13/Cpp/PythonLexerBase.h b/python/python3_13/Cpp/PythonLexerBase.h
index 24b63372..63305f43 100644
--- a/python/python3_13/Cpp/PythonLexerBase.h
+++ b/python/python3_13/Cpp/PythonLexerBase.h
@@ -35,23 +35,20 @@ THE SOFTWARE.
#include <vector>
#include <regex>
#include "antlr4-runtime.h"
-#include "PythonLexer.h"
-#include "PythonParser.h"
+//#include "PythonLexer.h"
-using namespace antlr4;
-
-class PythonLexerBase : public PythonLexer {
+class PythonLexerBase : public antlr4::Lexer {
public:
- explicit PythonLexerBase(CharStream *input): PythonLexer(input) {
+ PythonLexerBase(antlr4::CharStream *input): antlr4::Lexer(input) {
this->init();
}
- virtual std::unique_ptr<Token> nextToken() override;
+ virtual std::unique_ptr<antlr4::Token> nextToken() override;
virtual void reset() override;
private:
- std::unique_ptr<Token> cloneToken(const std::unique_ptr<Token> &source);
- std::unique_ptr<Token> cloneToken(const std::unique_ptr<Token> &source, size_t channel);
- std::unique_ptr<Token> cloneToken(const std::unique_ptr<Token> &source, const std::string &text);
- std::unique_ptr<Token> cloneToken(const std::unique_ptr<Token> &source, size_t channel, const std::string &text, size_t type);
+ std::unique_ptr<antlr4::Token> cloneToken(const std::unique_ptr<antlr4::Token> &source);
+ std::unique_ptr<antlr4::Token> cloneToken(const std::unique_ptr<antlr4::Token> &source, size_t channel);
+ std::unique_ptr<antlr4::Token> cloneToken(const std::unique_ptr<antlr4::Token> &source, const std::string &text);
+ std::unique_ptr<antlr4::Token> cloneToken(const std::unique_ptr<antlr4::Token> &source, size_t channel, const std::string &text, size_t type);
void init();
void checkNextToken();
void setCurrentAndFollowingTokens();
@@ -81,10 +78,10 @@ private:
bool isDictionaryComprehensionOrSetComprehension(const std::string &code);
void insertTrailingTokens();
void handleEOFtoken();
- void hideAndAddPendingToken(const std::unique_ptr<Token> &token);
- void createAndAddPendingToken(size_t type, size_t channel, const std::string &text, const std::unique_ptr<Token> &sampleToken);
- void createAndAddPendingToken(size_t type, size_t channel, const std::unique_ptr<Token> &sampleToken);
- void addPendingToken(const std::unique_ptr<Token> &token);
+ void hideAndAddPendingToken(const std::unique_ptr<antlr4::Token> &token);
+ void createAndAddPendingToken(size_t type, size_t channel, const std::string &text, const std::unique_ptr<antlr4::Token> &sampleToken);
+ void createAndAddPendingToken(size_t type, size_t channel, const std::unique_ptr<antlr4::Token> &sampleToken);
+ void addPendingToken(const std::unique_ptr<antlr4::Token> &token);
size_t getIndentationLength(const std::string &identText);
void reportLexerError(const std::string &errMsg);
void reportError(const std::string &errMsg);
@@ -92,7 +89,7 @@ private:
// A stack that keeps track of the indentation lengths
std::stack<size_t> indentLengthStack;
// A list where tokens are waiting to be loaded into the token stream
- std::vector<std::unique_ptr<Token>> pendingTokens;
+ std::vector<std::unique_ptr<antlr4::Token>> pendingTokens;
// last pending token types
size_t previousPendingTokenType;
size_t lastPendingTokenTypeFromDefaultChannel;
@@ -113,9 +110,9 @@ private:
bool wasTabIndentation;
bool wasIndentationMixedWithSpacesAndTabs;
- std::unique_ptr<Token> curToken; // current (under processing) token
- std::unique_ptr<Token> ffgToken; // following (look ahead) token
+ std::unique_ptr<antlr4::Token> curToken; // current (under processing) token
+ std::unique_ptr<antlr4::Token> ffgToken; // following (look ahead) token
const ssize_t INVALID_LENGTH = -1;
const std::string ERR_TXT = " ERROR: ";
-};
\ No newline at end of file
+};
diff --git a/python/python3_13/PythonLexer.g4 b/python/python3_13/PythonLexer.g4
index 98b99d4a..f2c036b9 100644
--- a/python/python3_13/PythonLexer.g4
+++ b/python/python3_13/PythonLexer.g4
@@ -32,6 +32,8 @@ lexer grammar PythonLexer;
options { superClass=PythonLexerBase; }
+// Insert here @header for lexer.
+
tokens {
ENCODING // https://docs.python.org/3.13/reference/lexical_analysis.html#encoding-declarations
, INDENT, DEDENT // https://docs.python.org/3.13/reference/lexical_analysis.html#indentation
diff --git a/python/python3_13/desc.xml b/python/python3_13/desc.xml
index 8aa6fdea..3ef04309 100644
--- a/python/python3_13/desc.xml
+++ b/python/python3_13/desc.xml
@@ -3,7 +3,7 @@
<antlr-version>^4.13.2</antlr-version>
<targets>CSharp;Java;Python3;JavaScript;TypeScript</targets>
<test>
- <targets>CSharp;Java;Python3;JavaScript;TypeScript</targets>
+ <targets>Cpp;CSharp;Java;Python3;JavaScript;TypeScript</targets>
<entry-point>file_input</entry-point>
<inputs>examples</inputs>
</test>
```
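Once the grammar has been transformed and the Cpp sources generated (see transformGrammar.py below), a small driver along these lines should compile, link, and run — a hypothetical smoke test, not part of the PR:

```cpp
// Hypothetical smoke test: tokenize a tiny input with the built lexer.
// Assumes the generated PythonLexer.h/.cpp and PythonLexerBase are built.
#include <iostream>
#include "antlr4-runtime.h"
#include "PythonLexer.h"

int main() {
    antlr4::ANTLRInputStream input("print('hello')\n");
    PythonLexer lexer(&input);
    antlr4::CommonTokenStream tokens(&lexer);
    tokens.fill();  // drive the lexer to EOF so all tokens are buffered
    for (size_t i = 0; i < tokens.size(); i++) {
        std::cout << tokens.get(i)->toString() << std::endl;
    }
    return 0;
}
```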
You will need to place this file, transformGrammar.py, in the Cpp/ directory along with PythonLexerBase.h and PythonLexerBase.cpp.
$ cat Cpp/transformGrammar.py
```python
import sys, os, re, shutil
from glob import glob
from pathlib import Path

def main(argv):
    for file in glob("./*.g4"):
        fix(file)

def fix(file_path):
    print("Altering " + file_path)
    if not os.path.exists(file_path):
        print(f"Could not find file: {file_path}")
        sys.exit(1)
    parts = os.path.split(file_path)
    file_name = parts[-1]
    shutil.move(file_path, file_path + ".bak")
    input_file = open(file_path + ".bak", 'r')
    output_file = open(file_path, 'w')
    for x in input_file:
        if '// Insert here @header for lexer.' in x:
            x = x.replace('// Insert here @header for lexer.', '@header {#include "PythonLexerBase.h"}')
        if 'this.' in x:
            x = x.replace('this.', 'this->')
        output_file.write(x)
    output_file.flush()
    print("Writing ...")
    input_file.close()
    output_file.close()

if __name__ == '__main__':
    main(sys.argv)
```
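For reference, once transformGrammar.py has rewritten the placeholder into `@header {#include "PythonLexerBase.h"}`, the generated PythonLexer.h picks up both that include and, through the `superClass=PythonLexerBase` option, the base class. Roughly (an illustrative sketch, not the actual generated file):

```cpp
// Sketch of the relevant lines of the generated PythonLexer.h
// (illustrative only; the real generated header differs in detail).
#include "antlr4-runtime.h"
#include "PythonLexerBase.h"  // injected by the @header action

class PythonLexer : public PythonLexerBase {  // from superClass option
public:
    explicit PythonLexer(antlr4::CharStream *input);
    // ... generated token constants, rule names, and ATN tables ...
};
```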
@kaby76 ok, I see. This is because I used the TypeScript version as a source, and it contains a class inherited from PythonLexer, not from Lexer.
Will try the changes you suggested in a couple of days.
That doesn't sound right, because PythonLexerBase is derived from Lexer, not PythonLexer, in the TypeScript port: https://github.com/antlr/grammars-v4/blob/56308f1c292051262306a8a604a225a4768e6663/python/python3_13/TypeScript/PythonLexerBase.ts#L35
@kaby76 should I hold off on this one before merging?
@teverett Hold, please. The PR is not being tested and it contains many compilation issues.
@kaby76 I applied the modifications you mentioned and was able to compile the code on my machine. I also fixed some typos which led to segfaults. Could you approve the workflow to test my changes, please?
P.S. Sorry for being late, I had too much work.
@teverett Could we get a build of this PR? Thanks!