1: <?php declare(strict_types = 1);
2:
3: namespace PHPStan\PhpDocParser\Lexer;
4:
5: use PHPStan\PhpDocParser\ParserConfig;
6: use function implode;
7: use function preg_match_all;
8: use const PREG_SET_ORDER;
9:
/**
 * Splits a PHPDoc comment string into a flat list of [value, type, line] tokens.
 *
 * Implementation based on Nette Tokenizer (New BSD License; https://github.com/nette/tokenizer)
 */
class Lexer
{

	public const TOKEN_REFERENCE = 0;
	public const TOKEN_UNION = 1;
	public const TOKEN_INTERSECTION = 2;
	public const TOKEN_NULLABLE = 3;
	public const TOKEN_OPEN_PARENTHESES = 4;
	public const TOKEN_CLOSE_PARENTHESES = 5;
	public const TOKEN_OPEN_ANGLE_BRACKET = 6;
	public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
	public const TOKEN_OPEN_SQUARE_BRACKET = 8;
	public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
	public const TOKEN_COMMA = 10;
	public const TOKEN_VARIADIC = 11;
	public const TOKEN_DOUBLE_COLON = 12;
	public const TOKEN_DOUBLE_ARROW = 13;
	public const TOKEN_EQUAL = 14;
	public const TOKEN_OPEN_PHPDOC = 15;
	public const TOKEN_CLOSE_PHPDOC = 16;
	public const TOKEN_PHPDOC_TAG = 17;
	public const TOKEN_DOCTRINE_TAG = 18;
	public const TOKEN_FLOAT = 19;
	public const TOKEN_INTEGER = 20;
	public const TOKEN_SINGLE_QUOTED_STRING = 21;
	public const TOKEN_DOUBLE_QUOTED_STRING = 22;
	public const TOKEN_DOCTRINE_ANNOTATION_STRING = 23;
	public const TOKEN_IDENTIFIER = 24;
	public const TOKEN_THIS_VARIABLE = 25;
	public const TOKEN_VARIABLE = 26;
	public const TOKEN_HORIZONTAL_WS = 27;
	public const TOKEN_PHPDOC_EOL = 28;
	public const TOKEN_OTHER = 29;
	public const TOKEN_END = 30;
	public const TOKEN_COLON = 31;
	public const TOKEN_WILDCARD = 32;
	public const TOKEN_OPEN_CURLY_BRACKET = 33;
	public const TOKEN_CLOSE_CURLY_BRACKET = 34;
	public const TOKEN_NEGATED = 35;
	public const TOKEN_ARROW = 36;

	public const TOKEN_COMMENT = 37;

	/** Display label for every token type, keyed by the TOKEN_* constant. */
	public const TOKEN_LABELS = [
		self::TOKEN_REFERENCE => '\'&\'',
		self::TOKEN_UNION => '\'|\'',
		self::TOKEN_INTERSECTION => '\'&\'',
		self::TOKEN_NULLABLE => '\'?\'',
		self::TOKEN_NEGATED => '\'!\'',
		self::TOKEN_OPEN_PARENTHESES => '\'(\'',
		self::TOKEN_CLOSE_PARENTHESES => '\')\'',
		self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
		self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
		self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
		self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
		self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
		self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
		self::TOKEN_COMMA => '\',\'',
		self::TOKEN_COMMENT => '\'//\'',
		self::TOKEN_COLON => '\':\'',
		self::TOKEN_VARIADIC => '\'...\'',
		self::TOKEN_DOUBLE_COLON => '\'::\'',
		self::TOKEN_DOUBLE_ARROW => '\'=>\'',
		self::TOKEN_ARROW => '\'->\'',
		self::TOKEN_EQUAL => '\'=\'',
		self::TOKEN_OPEN_PHPDOC => '\'/**\'',
		self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
		self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
		self::TOKEN_DOCTRINE_TAG => 'TOKEN_DOCTRINE_TAG',
		self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
		self::TOKEN_FLOAT => 'TOKEN_FLOAT',
		self::TOKEN_INTEGER => 'TOKEN_INTEGER',
		self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
		self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
		self::TOKEN_DOCTRINE_ANNOTATION_STRING => 'TOKEN_DOCTRINE_ANNOTATION_STRING',
		self::TOKEN_IDENTIFIER => 'type',
		self::TOKEN_THIS_VARIABLE => '\'$this\'',
		self::TOKEN_VARIABLE => 'variable',
		self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
		self::TOKEN_OTHER => 'TOKEN_OTHER',
		self::TOKEN_END => 'TOKEN_END',
		self::TOKEN_WILDCARD => '*',
	];

	// Indexes into a single token tuple as produced by tokenize(): [value, type, line].
	public const VALUE_OFFSET = 0;
	public const TYPE_OFFSET = 1;
	public const LINE_OFFSET = 2;

	private ParserConfig $config; // @phpstan-ignore property.onlyWritten

	/** Combined tokenizer regexp; built on the first tokenize() call and reused afterwards. */
	private ?string $regexp = null;

	/**
	 * @param ParserConfig $config stored on the instance; nothing in this class reads it back
	 */
	public function __construct(ParserConfig $config)
	{
		$this->config = $config;
	}


	/**
	 * Tokenizes the given string.
	 *
	 * Every returned token is a [value, type, line] tuple (see the *_OFFSET constants):
	 * the matched text, one of the TOKEN_* constants and a 1-based line number.
	 * The list always ends with an empty-valued TOKEN_END token.
	 *
	 * @return list<array{string, int, int}>
	 */
	public function tokenize(string $s): array
	{
		$this->regexp ??= $this->generateRegexp();

		preg_match_all($this->regexp, $s, $matches, PREG_SET_ORDER);

		$tokens = [];
		$line = 1;
		foreach ($matches as $match) {
			// The (*MARK:<type>) of the alternative that matched carries the token type.
			$type = (int) $match['MARK'];
			$tokens[] = [$match[0], $type, $line];

			// A TOKEN_PHPDOC_EOL token is reported on the line it terminates;
			// only the tokens after it belong to the next line.
			// NOTE(review): a TOKEN_DOCTRINE_ANNOTATION_STRING match may itself contain
			// newlines (its [^"] class does not exclude them) and those are not counted
			// here - confirm this is intended.
			if ($type === self::TOKEN_PHPDOC_EOL) {
				$line++;
			}
		}

		$tokens[] = ['', self::TOKEN_END, $line];

		return $tokens;
	}


	/**
	 * Builds one alternation regexp out of the per-token patterns.
	 *
	 * Alternatives are tried in the order they are listed below, so the order is
	 * significant (e.g. TOKEN_REFERENCE has to precede TOKEN_INTERSECTION and
	 * TOKEN_DOUBLE_ARROW has to precede TOKEN_EQUAL).
	 */
	private function generateRegexp(): string
	{
		$patterns = [
			self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',

			self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
			self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
			self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',

			// '&' followed (after optional whitespace) by '.', ',', '=' or ')',
			// or by a '$' that does not start TOKEN_THIS_VARIABLE
			self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
			self::TOKEN_UNION => '\\|',
			self::TOKEN_INTERSECTION => '&',
			self::TOKEN_NULLABLE => '\\?',
			self::TOKEN_NEGATED => '!',

			self::TOKEN_OPEN_PARENTHESES => '\\(',
			self::TOKEN_CLOSE_PARENTHESES => '\\)',
			self::TOKEN_OPEN_ANGLE_BRACKET => '<',
			self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
			self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
			self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
			self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
			self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',

			self::TOKEN_COMMA => ',',
			self::TOKEN_COMMENT => '\/\/[^\\r\\n]*(?=\n|\r|\*/)',
			self::TOKEN_VARIADIC => '\\.\\.\\.',
			self::TOKEN_DOUBLE_COLON => '::',
			self::TOKEN_DOUBLE_ARROW => '=>',
			self::TOKEN_ARROW => '->',
			self::TOKEN_EQUAL => '=',
			self::TOKEN_COLON => ':',

			self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
			self::TOKEN_CLOSE_PHPDOC => '\\*/',
			self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
			self::TOKEN_DOCTRINE_TAG => '@[a-z_\\\\][a-z0-9_\:\\\\]*[a-z_][a-z0-9_]*',
			self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',

			self::TOKEN_FLOAT => '[+\-]?(?:(?:[0-9]++(_[0-9]++)*\\.[0-9]*+(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]*+(_[0-9]++)*\\.[0-9]++(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]++(_[0-9]++)*e[+\-]?[0-9]++(_[0-9]++)*))',
			self::TOKEN_INTEGER => '[+\-]?(?:(?:0b[0-1]++(_[0-1]++)*)|(?:0o[0-7]++(_[0-7]++)*)|(?:0x[0-9a-f]++(_[0-9a-f]++)*)|(?:[0-9]++(_[0-9]++)*))',
			self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
			self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',
			self::TOKEN_DOCTRINE_ANNOTATION_STRING => '"(?:""|[^"])*+"',

			self::TOKEN_WILDCARD => '\\*',

			// any run of non-whitespace characters that stops before a TOKEN_CLOSE_PHPDOC
			self::TOKEN_OTHER => '(?:(?!\\*/)[^\\s])++',
		];

		// Tag every alternative with (*MARK:<type>) so that tokenize() can read the
		// token type from $match['MARK']. A fresh list is built instead of rewriting
		// $patterns through a by-reference loop, which would leave a dangling reference.
		$alternatives = [];
		foreach ($patterns as $type => $pattern) {
			$alternatives[] = '(?:' . $pattern . ')(*MARK:' . $type . ')';
		}

		// Modifiers: A = anchored (each match has to start where the previous one ended),
		// s = dot matches newlines, i = case-insensitive.
		return '~' . implode('|', $alternatives) . '~Asi';
	}

}
200: