
PythonLexerBase.java 5.5 kB

package depends.extractor.python.union;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CommonToken;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.Token;

import depends.extractor.python.PythonLexer;

import java.util.Stack;

public abstract class PythonLexerBase extends Lexer {
    public static int TabSize = 8;

    // The number of opened braces, brackets and parentheses.
    private int _opened;

    // The stack that keeps track of the indentation levels.
    private Stack<Integer> _indents = new Stack<>();

    // A circular buffer where extra tokens are pushed on (see the NEWLINE and WS lexer rules).
    private int _firstTokensInd;
    private int _lastTokenInd;
    private Token[] _buffer = new Token[32];
    private Token _lastToken;

    protected PythonLexerBase(CharStream input) {
        super(input);
    }

    @Override
    public void emit(Token token) {
        super.setToken(token);
        if (_buffer[_firstTokensInd] != null)
        {
            _lastTokenInd = IncTokenInd(_lastTokenInd);
            if (_lastTokenInd == _firstTokensInd)
            {
                // Enlarge buffer
                Token[] newArray = new Token[_buffer.length * 2];
                int destInd = newArray.length - (_buffer.length - _firstTokensInd);
                System.arraycopy(_buffer, 0, newArray, 0, _firstTokensInd);
                System.arraycopy(_buffer, _firstTokensInd, newArray, destInd, _buffer.length - _firstTokensInd);
                _firstTokensInd = destInd;
                _buffer = newArray;
            }
        }
        _buffer[_lastTokenInd] = token;
        _lastToken = token;
    }

    @Override
    public Token nextToken() {
        // Check if the end-of-file is ahead and there are still some DEDENTs expected.
        if (_input.LA(1) == EOF && _indents.size() > 0)
        {
            if (_buffer[_lastTokenInd] == null || _buffer[_lastTokenInd].getType() != PythonLexer.LINE_BREAK)
            {
                // First emit an extra line break that serves as the end of the statement.
                emit(PythonLexer.LINE_BREAK);
            }
            // Now emit as many DEDENT tokens as needed.
            while (_indents.size() != 0)
            {
                emit(PythonLexer.DEDENT);
                _indents.pop();
            }
        }
        Token next = super.nextToken();
        if (_buffer[_firstTokensInd] == null)
        {
            return next;
        }
        Token result = _buffer[_firstTokensInd];
        _buffer[_firstTokensInd] = null;
        if (_firstTokensInd != _lastTokenInd)
        {
            _firstTokensInd = IncTokenInd(_firstTokensInd);
        }
        return result;
    }

    protected void HandleNewLine() {
        emit(PythonLexer.NEWLINE, HIDDEN, getText());
        char next = (char) _input.LA(1);
        // Whitespace is processed in HandleSpaces.
        if (next != ' ' && next != '\t' && IsNotNewLineOrComment(next))
        {
            ProcessNewLine(0);
        }
    }

    protected void HandleSpaces() {
        char next = (char) _input.LA(1);
        if ((_lastToken == null || _lastToken.getType() == PythonLexer.NEWLINE) && IsNotNewLineOrComment(next))
        {
            // Calculates the indentation of the provided spaces, taking the
            // following rules into account:
            //
            // "Tabs are replaced (from left to right) by one to eight spaces
            // such that the total number of characters up to and including
            // the replacement is a multiple of eight [...]"
            //
            // -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
            int indent = 0;
            String text = getText();
            for (int i = 0; i < text.length(); i++) {
                indent += text.charAt(i) == '\t' ? TabSize - indent % TabSize : 1;
            }
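            // Illustrative example of the computation above (assuming the default
            // TabSize of 8): leading whitespace "\t  " expands to 8 + 1 + 1 = 10
            // columns, while "  \t" expands to 2 + (8 - 2) = 8 columns.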
            ProcessNewLine(indent);
        }
        emit(PythonLexer.WS, HIDDEN, getText());
    }

    protected void IncIndentLevel() {
        _opened++;
    }

    protected void DecIndentLevel() {
        if (_opened > 0) {
            --_opened;
        }
    }

    private boolean IsNotNewLineOrComment(char next) {
        return _opened == 0 && next != '\r' && next != '\n' && next != '\f' && next != '#';
    }

    private void ProcessNewLine(int indent) {
        emit(PythonLexer.LINE_BREAK);
        int previous = _indents.size() == 0 ? 0 : _indents.peek();
        if (indent > previous)
        {
            _indents.push(indent);
            emit(PythonLexer.INDENT);
        }
        else
        {
            // Possibly emit more than 1 DEDENT token.
            while (_indents.size() != 0 && _indents.peek() > indent)
            {
                emit(PythonLexer.DEDENT);
                _indents.pop();
            }
        }
    }

    private int IncTokenInd(int ind) {
        return (ind + 1) % _buffer.length;
    }

    private void emit(int tokenType) {
        emit(tokenType, DEFAULT_TOKEN_CHANNEL, "");
    }

    private void emit(int tokenType, int channel, String text) {
        int charIndex = getCharIndex();
        CommonToken token = new CommonToken(_tokenFactorySourcePair, tokenType, channel, charIndex - text.length(), charIndex);
        token.setLine(getLine());
        token.setCharPositionInLine(getCharPositionInLine());
        token.setText(text);
        emit(token);
    }
}
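For context, a lexer generated from a grammar that declares this class as its superClass, and whose NEWLINE and WS rules call HandleNewLine() and HandleSpaces(), can be driven as in the minimal sketch below. It assumes that depends.extractor.python.PythonLexer is that concrete generated lexer with the usual ANTLR constructor taking a CharStream; the class name PythonLexerDemo, the demo package, and the sample input string are illustrative only.

package depends.extractor.python.union;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.Token;

import depends.extractor.python.PythonLexer;

public class PythonLexerDemo {
    public static void main(String[] args) {
        // Two statements, the second one indented; a grammar wired to the base
        // class above should synthesize LINE_BREAK, INDENT and a trailing DEDENT.
        CharStream input = CharStreams.fromString("if x:\n    y = 1\n");
        PythonLexer lexer = new PythonLexer(input);
        // Drain the token stream and print symbolic token names alongside their text.
        for (Token t = lexer.nextToken(); t.getType() != Token.EOF; t = lexer.nextToken()) {
            System.out.println(lexer.getVocabulary().getSymbolicName(t.getType()) + " '" + t.getText() + "'");
        }
    }
}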
