diff --git a/.gitattributes b/.gitattributes index 00a7b00c..1df6e00c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ .git_archival.txt export-subst +cf_units/_udunits2_parser/parser/**/*.py linguist-generated=true +cf_units/_udunits2_parser/_antlr4_runtime/**/*.py linguist-generated=true diff --git a/cf_units/_udunits2_parser/README.md b/cf_units/_udunits2_parser/README.md index a746f3ed..dae6f6c8 100644 --- a/cf_units/_udunits2_parser/README.md +++ b/cf_units/_udunits2_parser/README.md @@ -13,9 +13,10 @@ a number of convenient lexical elements. Once the Jinja2 template has been expanded, the [ANTLR Java library](https://github.com/antlr/antlr4) is used to -compile the grammar into the targetted runtime language. +compile the grammar into the targeted runtime language. [A script](compile.py) is provided to automate this as much as possible. +It has a dependency on pip, Jinja2, Java and ruff. The compiled parser is committed to the repository for ease of deployment and testing (we know it isn't ideal, but it really does make things easier). @@ -24,9 +25,23 @@ changes to the grammar being proposed so that the two can remain in synch. ### Updating the ANTLR version -The above script downloads a Java Jar which needs updating to the same version -as antlr4-python3-runtime specified in the python requirements. Once these have -been updated, run [the script](compile.py) to regenerate the parser. +The [compile.py script](compile.py) copies the ANTLR4 runtime into the _antlr4_runtime +directory, and this should be committed to the repository. This means that we do not +have a runtime dependency on ANTLR4 (which was found to be challenging due to the +fact that you need to pin to a specific version of the ANTLR4 runtime, and aligning +this version with other libraries which also have an ANTLR4 dependency is impractical). 
+ +Since the generated code is committed to this repo, and the ANTLR4 runtime is also vendored into it, we won't ever need to run ANTLR4 unless the grammar changes. + +So, we will only change the ANTLR4 version if we need new features of the +parser/lexer generators, or it becomes difficult to support the older version. + +Upgrading the ANTLR4 version is a simple matter of changing `ANTLR_VERSION` in the compile.py +script, and then re-running it. This should re-generate the parser/lexer, and update +the content in the _antlr4_runtime directory. One complexity may be that the imports +of the ANTLR4 runtime need to be rewritten to support vendoring, and the code needed +to do so may change from version to version. This topic is being followed upstream +with the ANTLR4 project with the hope of making this easier and/or built-in to ANTLR4. ### Testing the grammar diff --git a/cf_units/_udunits2_parser/__init__.py b/cf_units/_udunits2_parser/__init__.py index aec51503..63113cc5 100644 --- a/cf_units/_udunits2_parser/__init__.py +++ b/cf_units/_udunits2_parser/__init__.py @@ -5,10 +5,14 @@ import unicodedata -from antlr4 import CommonTokenStream, InputStream -from antlr4.error.ErrorListener import ErrorListener - from . import graph +from ._antlr4_runtime import ( + CommonTokenStream, + InputStream, +) +from ._antlr4_runtime.error.ErrorListener import ( + ErrorListener, +) from .parser.udunits2Lexer import udunits2Lexer from .parser.udunits2Parser import udunits2Parser from .parser.udunits2ParserVisitor import udunits2ParserVisitor diff --git a/cf_units/_udunits2_parser/_antlr4_runtime/BufferedTokenStream.py b/cf_units/_udunits2_parser/_antlr4_runtime/BufferedTokenStream.py new file mode 100644 index 00000000..a9b6ec4f --- /dev/null +++ b/cf_units/_udunits2_parser/_antlr4_runtime/BufferedTokenStream.py @@ -0,0 +1,309 @@ +# +# Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. 
+# Use of this file is governed by the BSD 3-clause license that +# can be found in the LICENSE.txt file in the project root. + +# This implementation of {@link TokenStream} loads tokens from a +# {@link TokenSource} on-demand, and places the tokens in a buffer to provide +# access to any previous token by index. +# +#

+# This token stream ignores the value of {@link Token#getChannel}. If your +# parser requires the token stream filter tokens to only those on a particular +# channel, such as {@link Token#DEFAULT_CHANNEL} or +# {@link Token#HIDDEN_CHANNEL}, use a filtering token stream such a +# {@link CommonTokenStream}.

+from io import StringIO + +from .error.Errors import IllegalStateException +from .Token import Token + +# need forward declaration +Lexer = None + + +# this is just to keep meaningful parameter types to Parser +class TokenStream: + pass + + +class BufferedTokenStream(TokenStream): + __slots__ = ("tokenSource", "tokens", "index", "fetchedEOF") + + def __init__(self, tokenSource: Lexer): + # The {@link TokenSource} from which tokens for this stream are fetched. + self.tokenSource = tokenSource + + # A collection of all tokens fetched from the token source. The list is + # considered a complete view of the input once {@link #fetchedEOF} is set + # to {@code true}. + self.tokens = [] + + # The index into {@link #tokens} of the current token (next token to + # {@link #consume}). {@link #tokens}{@code [}{@link #p}{@code ]} should be + # {@link #LT LT(1)}. + # + #

This field is set to -1 when the stream is first constructed or when + # {@link #setTokenSource} is called, indicating that the first token has + # not yet been fetched from the token source. For additional information, + # see the documentation of {@link IntStream} for a description of + # Initializing Methods.

+ self.index = -1 + + # Indicates whether the {@link Token#EOF} token has been fetched from + # {@link #tokenSource} and added to {@link #tokens}. This field improves + # performance for the following cases: + # + #