1: <?php declare(strict_types = 1);
2:
3: namespace PHPStan\PhpDocParser\Lexer;
4:
use PHPStan\PhpDocParser\ParserConfig;
use function implode;
use function preg_match_all;
use function substr_count;
use const PREG_SET_ORDER;
9:
/**
 * Splits a PHPDoc string into a flat list of typed tokens.
 *
 * All token patterns are combined into one anchored alternation; each
 * alternative carries a (*MARK:<type>) verb so that a single preg_match_all()
 * call yields both the matched text and the token type.
 *
 * Implementation based on Nette Tokenizer (New BSD License; https://github.com/nette/tokenizer)
 */
class Lexer
{

    public const TOKEN_REFERENCE = 0;
    public const TOKEN_UNION = 1;
    public const TOKEN_INTERSECTION = 2;
    public const TOKEN_NULLABLE = 3;
    public const TOKEN_OPEN_PARENTHESES = 4;
    public const TOKEN_CLOSE_PARENTHESES = 5;
    public const TOKEN_OPEN_ANGLE_BRACKET = 6;
    public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
    public const TOKEN_OPEN_SQUARE_BRACKET = 8;
    public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
    public const TOKEN_COMMA = 10;
    public const TOKEN_VARIADIC = 11;
    public const TOKEN_DOUBLE_COLON = 12;
    public const TOKEN_DOUBLE_ARROW = 13;
    public const TOKEN_EQUAL = 14;
    public const TOKEN_OPEN_PHPDOC = 15;
    public const TOKEN_CLOSE_PHPDOC = 16;
    public const TOKEN_PHPDOC_TAG = 17;
    public const TOKEN_DOCTRINE_TAG = 18;
    public const TOKEN_FLOAT = 19;
    public const TOKEN_INTEGER = 20;
    public const TOKEN_SINGLE_QUOTED_STRING = 21;
    public const TOKEN_DOUBLE_QUOTED_STRING = 22;
    public const TOKEN_DOCTRINE_ANNOTATION_STRING = 23;
    public const TOKEN_IDENTIFIER = 24;
    public const TOKEN_THIS_VARIABLE = 25;
    public const TOKEN_VARIABLE = 26;
    public const TOKEN_HORIZONTAL_WS = 27;
    public const TOKEN_PHPDOC_EOL = 28;
    public const TOKEN_OTHER = 29;
    public const TOKEN_END = 30;
    public const TOKEN_COLON = 31;
    public const TOKEN_WILDCARD = 32;
    public const TOKEN_OPEN_CURLY_BRACKET = 33;
    public const TOKEN_CLOSE_CURLY_BRACKET = 34;
    public const TOKEN_NEGATED = 35;
    public const TOKEN_ARROW = 36;

    /** Human-readable label for every token type, keyed by the TOKEN_* constant. */
    public const TOKEN_LABELS = [
        self::TOKEN_REFERENCE => '\'&\'',
        self::TOKEN_UNION => '\'|\'',
        self::TOKEN_INTERSECTION => '\'&\'',
        self::TOKEN_NULLABLE => '\'?\'',
        self::TOKEN_NEGATED => '\'!\'',
        self::TOKEN_OPEN_PARENTHESES => '\'(\'',
        self::TOKEN_CLOSE_PARENTHESES => '\')\'',
        self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
        self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
        self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
        self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
        self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
        self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
        self::TOKEN_COMMA => '\',\'',
        self::TOKEN_COLON => '\':\'',
        self::TOKEN_VARIADIC => '\'...\'',
        self::TOKEN_DOUBLE_COLON => '\'::\'',
        self::TOKEN_DOUBLE_ARROW => '\'=>\'',
        self::TOKEN_ARROW => '\'->\'',
        self::TOKEN_EQUAL => '\'=\'',
        self::TOKEN_OPEN_PHPDOC => '\'/**\'',
        self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
        self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
        self::TOKEN_DOCTRINE_TAG => 'TOKEN_DOCTRINE_TAG',
        self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
        self::TOKEN_FLOAT => 'TOKEN_FLOAT',
        self::TOKEN_INTEGER => 'TOKEN_INTEGER',
        self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
        self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
        self::TOKEN_DOCTRINE_ANNOTATION_STRING => 'TOKEN_DOCTRINE_ANNOTATION_STRING',
        self::TOKEN_IDENTIFIER => 'type',
        self::TOKEN_THIS_VARIABLE => '\'$this\'',
        self::TOKEN_VARIABLE => 'variable',
        self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
        self::TOKEN_OTHER => 'TOKEN_OTHER',
        self::TOKEN_END => 'TOKEN_END',
        self::TOKEN_WILDCARD => '*',
    ];

    // Positions of the fields inside a token tuple as built by tokenize():
    // [0 => matched text, 1 => TOKEN_* type, 2 => line number].
    public const VALUE_OFFSET = 0;
    public const TYPE_OFFSET = 1;
    public const LINE_OFFSET = 2;

    /** Stored but not read anywhere in this class. */
    private ParserConfig $config; // @phpstan-ignore property.onlyWritten

    /** Combined token regexp; built on the first tokenize() call and then reused. */
    private ?string $regexp = null;

    public function __construct(ParserConfig $config)
    {
        $this->config = $config;
    }


    /**
     * Tokenizes the given string.
     *
     * Every token is a tuple of [matched text, TOKEN_* type, 1-based line on
     * which the token starts]. A TOKEN_END token with an empty value is always
     * appended as the last element.
     *
     * The pattern is anchored, so tokens are matched contiguously from the
     * start of the input; matching ends at the first offset where no token
     * pattern applies.
     * NOTE(review): whitespace other than tab, space and (CR)LF - e.g. a lone
     * CR - does not seem to be covered by any pattern, so the remaining input
     * would be dropped there; confirm whether that is intended.
     *
     * @return list<array{string, int, int}>
     */
    public function tokenize(string $s): array
    {
        $this->regexp ??= $this->generateRegexp();

        preg_match_all($this->regexp, $s, $matches, PREG_SET_ORDER);

        $tokens = [];
        $line = 1;
        foreach ($matches as $match) {
            $value = $match[0];
            $tokens[] = [$value, (int) $match['MARK'], $line];

            // Advance by the number of newlines the token itself consumed.
            // TOKEN_PHPDOC_EOL always contains exactly one; a
            // TOKEN_DOCTRINE_ANNOTATION_STRING may span several lines, and
            // counting only EOL tokens would leave every following token
            // with a line number that is too low.
            $line += substr_count($value, "\n");
        }

        $tokens[] = ['', self::TOKEN_END, $line];

        return $tokens;
    }


    /**
     * Builds the single regular expression used by tokenize().
     *
     * Each pattern is wrapped as "(?:<pattern>)(*MARK:<type>)" and the pieces
     * are joined with "|" in declaration order, so earlier entries take
     * precedence over later ones. The expression is compiled with the
     * A (anchored), s (dot matches newline) and i (case-insensitive) modifiers.
     */
    private function generateRegexp(): string
    {
        $patterns = [
            self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',

            self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
            self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
            self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',

            // '&' followed (after optional whitespace) by '.', ',', '=' or ')',
            // or by a variable other than $this; must precede TOKEN_INTERSECTION
            self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
            self::TOKEN_UNION => '\\|',
            self::TOKEN_INTERSECTION => '&',
            self::TOKEN_NULLABLE => '\\?',
            self::TOKEN_NEGATED => '!',

            self::TOKEN_OPEN_PARENTHESES => '\\(',
            self::TOKEN_CLOSE_PARENTHESES => '\\)',
            self::TOKEN_OPEN_ANGLE_BRACKET => '<',
            self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
            self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
            self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
            self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
            self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',

            self::TOKEN_COMMA => ',',
            self::TOKEN_VARIADIC => '\\.\\.\\.',
            self::TOKEN_DOUBLE_COLON => '::',
            self::TOKEN_DOUBLE_ARROW => '=>',
            self::TOKEN_ARROW => '->',
            self::TOKEN_EQUAL => '=',
            self::TOKEN_COLON => ':',

            self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
            self::TOKEN_CLOSE_PHPDOC => '\\*/',
            self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
            self::TOKEN_DOCTRINE_TAG => '@[a-z_\\\\][a-z0-9_\:\\\\]*[a-z_][a-z0-9_]*',
            // line break plus the indentation and optional leading '*' of the next line
            self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',

            self::TOKEN_FLOAT => '[+\-]?(?:(?:[0-9]++(_[0-9]++)*\\.[0-9]*+(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]*+(_[0-9]++)*\\.[0-9]++(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]++(_[0-9]++)*e[+\-]?[0-9]++(_[0-9]++)*))',
            self::TOKEN_INTEGER => '[+\-]?(?:(?:0b[0-1]++(_[0-1]++)*)|(?:0o[0-7]++(_[0-7]++)*)|(?:0x[0-9a-f]++(_[0-9a-f]++)*)|(?:[0-9]++(_[0-9]++)*))',
            self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
            self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',
            // unlike the quoted strings above, this one may contain line breaks
            self::TOKEN_DOCTRINE_ANNOTATION_STRING => '"(?:""|[^"])*+"',

            self::TOKEN_WILDCARD => '\\*',

            // any run of non-whitespace characters that does not run into '*/'
            self::TOKEN_OTHER => '(?:(?!\\*/)[^\\s])++',
        ];

        // Build a fresh list instead of rewriting $patterns through a
        // by-reference loop variable, which would leave a dangling reference.
        $alternatives = [];
        foreach ($patterns as $type => $pattern) {
            $alternatives[] = '(?:' . $pattern . ')(*MARK:' . $type . ')';
        }

        return '~' . implode('|', $alternatives) . '~Asi';
    }

}
196: