1: <?php declare(strict_types = 1);
2:
3: namespace PHPStan\PhpDocParser\Lexer;
4:
use PHPStan\PhpDocParser\ParserConfig;
use function implode;
use function preg_match_all;
use function substr_count;
use const PREG_SET_ORDER;
9:
/**
 * Splits a PHPDoc comment string into a flat list of typed tokens.
 *
 * Every token is a tuple of [value, type, line]; the positions are exposed
 * as VALUE_OFFSET, TYPE_OFFSET and LINE_OFFSET. The list returned by
 * tokenize() always ends with an empty TOKEN_END token.
 *
 * Implementation based on Nette Tokenizer (New BSD License; https://github.com/nette/tokenizer)
 */
class Lexer
{

    // Token type identifiers. The numeric values are embedded into the
    // generated regular expression as (*MARK:n) names and read back in
    // tokenize(), so they must stay unique integers.
    public const TOKEN_REFERENCE = 0;
    public const TOKEN_UNION = 1;
    public const TOKEN_INTERSECTION = 2;
    public const TOKEN_NULLABLE = 3;
    public const TOKEN_OPEN_PARENTHESES = 4;
    public const TOKEN_CLOSE_PARENTHESES = 5;
    public const TOKEN_OPEN_ANGLE_BRACKET = 6;
    public const TOKEN_CLOSE_ANGLE_BRACKET = 7;
    public const TOKEN_OPEN_SQUARE_BRACKET = 8;
    public const TOKEN_CLOSE_SQUARE_BRACKET = 9;
    public const TOKEN_COMMA = 10;
    public const TOKEN_VARIADIC = 11;
    public const TOKEN_DOUBLE_COLON = 12;
    public const TOKEN_DOUBLE_ARROW = 13;
    public const TOKEN_EQUAL = 14;
    public const TOKEN_OPEN_PHPDOC = 15;
    public const TOKEN_CLOSE_PHPDOC = 16;
    public const TOKEN_PHPDOC_TAG = 17;
    public const TOKEN_DOCTRINE_TAG = 18;
    public const TOKEN_FLOAT = 19;
    public const TOKEN_INTEGER = 20;
    public const TOKEN_SINGLE_QUOTED_STRING = 21;
    public const TOKEN_DOUBLE_QUOTED_STRING = 22;
    public const TOKEN_DOCTRINE_ANNOTATION_STRING = 23;
    public const TOKEN_IDENTIFIER = 24;
    public const TOKEN_THIS_VARIABLE = 25;
    public const TOKEN_VARIABLE = 26;
    public const TOKEN_HORIZONTAL_WS = 27;
    public const TOKEN_PHPDOC_EOL = 28;
    public const TOKEN_OTHER = 29;
    public const TOKEN_END = 30;
    public const TOKEN_COLON = 31;
    public const TOKEN_WILDCARD = 32;
    public const TOKEN_OPEN_CURLY_BRACKET = 33;
    public const TOKEN_CLOSE_CURLY_BRACKET = 34;
    public const TOKEN_NEGATED = 35;
    public const TOKEN_ARROW = 36;

    public const TOKEN_COMMENT = 37;

    // Display label for each token type.
    public const TOKEN_LABELS = [
        self::TOKEN_REFERENCE => '\'&\'',
        self::TOKEN_UNION => '\'|\'',
        self::TOKEN_INTERSECTION => '\'&\'',
        self::TOKEN_NULLABLE => '\'?\'',
        self::TOKEN_NEGATED => '\'!\'',
        self::TOKEN_OPEN_PARENTHESES => '\'(\'',
        self::TOKEN_CLOSE_PARENTHESES => '\')\'',
        self::TOKEN_OPEN_ANGLE_BRACKET => '\'<\'',
        self::TOKEN_CLOSE_ANGLE_BRACKET => '\'>\'',
        self::TOKEN_OPEN_SQUARE_BRACKET => '\'[\'',
        self::TOKEN_CLOSE_SQUARE_BRACKET => '\']\'',
        self::TOKEN_OPEN_CURLY_BRACKET => '\'{\'',
        self::TOKEN_CLOSE_CURLY_BRACKET => '\'}\'',
        self::TOKEN_COMMA => '\',\'',
        self::TOKEN_COMMENT => '\'//\'',
        self::TOKEN_COLON => '\':\'',
        self::TOKEN_VARIADIC => '\'...\'',
        self::TOKEN_DOUBLE_COLON => '\'::\'',
        self::TOKEN_DOUBLE_ARROW => '\'=>\'',
        self::TOKEN_ARROW => '\'->\'',
        self::TOKEN_EQUAL => '\'=\'',
        self::TOKEN_OPEN_PHPDOC => '\'/**\'',
        self::TOKEN_CLOSE_PHPDOC => '\'*/\'',
        self::TOKEN_PHPDOC_TAG => 'TOKEN_PHPDOC_TAG',
        self::TOKEN_DOCTRINE_TAG => 'TOKEN_DOCTRINE_TAG',
        self::TOKEN_PHPDOC_EOL => 'TOKEN_PHPDOC_EOL',
        self::TOKEN_FLOAT => 'TOKEN_FLOAT',
        self::TOKEN_INTEGER => 'TOKEN_INTEGER',
        self::TOKEN_SINGLE_QUOTED_STRING => 'TOKEN_SINGLE_QUOTED_STRING',
        self::TOKEN_DOUBLE_QUOTED_STRING => 'TOKEN_DOUBLE_QUOTED_STRING',
        self::TOKEN_DOCTRINE_ANNOTATION_STRING => 'TOKEN_DOCTRINE_ANNOTATION_STRING',
        self::TOKEN_IDENTIFIER => 'type',
        self::TOKEN_THIS_VARIABLE => '\'$this\'',
        self::TOKEN_VARIABLE => 'variable',
        self::TOKEN_HORIZONTAL_WS => 'TOKEN_HORIZONTAL_WS',
        self::TOKEN_OTHER => 'TOKEN_OTHER',
        self::TOKEN_END => 'TOKEN_END',
        self::TOKEN_WILDCARD => '*',
    ];

    // Positions of the fields inside each token tuple built by tokenize().
    public const VALUE_OFFSET = 0;
    public const TYPE_OFFSET = 1;
    public const LINE_OFFSET = 2;

    // Stored by the constructor; not read anywhere in this class.
    private ParserConfig $config; // @phpstan-ignore property.onlyWritten

    // Combined token regexp, built on the first tokenize() call and reused afterwards.
    private ?string $regexp = null;

    public function __construct(ParserConfig $config)
    {
        $this->config = $config;
    }

    /**
     * Tokenizes the given PHPDoc string.
     *
     * Each entry of the result is [matched text, token type, line], where the
     * line is the 1-based line on which the token starts. A final
     * ['', self::TOKEN_END, line] entry is always appended.
     *
     * @return list<array{string, int, int}>
     */
    public function tokenize(string $s): array
    {
        $this->regexp ??= $this->generateRegexp();

        // NOTE(review): the return value of preg_match_all() is not checked, so a
        // PCRE failure yields only the tokens collected so far (plus TOKEN_END).
        preg_match_all($this->regexp, $s, $matches, PREG_SET_ORDER);

        $tokens = [];
        $line = 1;
        foreach ($matches as $match) {
            // The name of the (*MARK:n) verb that matched is the token type.
            $tokens[] = [$match[0], (int) $match['MARK'], $line];

            // Advance by every line break inside the token. TOKEN_PHPDOC_EOL holds
            // exactly one "\n", but TOKEN_DOCTRINE_ANNOTATION_STRING may span several
            // lines; counting only EOL tokens would leave later tokens on too low a line.
            $line += substr_count($match[0], "\n");
        }

        $tokens[] = ['', self::TOKEN_END, $line];

        return $tokens;
    }

    /**
     * Builds the single regular expression used by tokenize().
     *
     * Every token pattern becomes one alternative tagged with (*MARK:<type>).
     * Alternatives are tried in the order listed here, so the order is
     * significant (e.g. TOKEN_REFERENCE must precede TOKEN_INTERSECTION).
     * Modifiers: A = anchored, s = dot matches newline, i = case-insensitive.
     */
    private function generateRegexp(): string
    {
        $patterns = [
            self::TOKEN_HORIZONTAL_WS => '[\\x09\\x20]++',

            self::TOKEN_IDENTIFIER => '(?:[\\\\]?+[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF-]*+)++',
            self::TOKEN_THIS_VARIABLE => '\\$this(?![0-9a-z_\\x80-\\xFF])',
            self::TOKEN_VARIABLE => '\\$[a-z_\\x80-\\xFF][0-9a-z_\\x80-\\xFF]*+',

            // '&' followed (after optional whitespace) by '.', ',', '=', ')'
            // or by a '$' that does not start the $this variable
            self::TOKEN_REFERENCE => '&(?=\\s*+(?:[.,=)]|(?:\\$(?!this(?![0-9a-z_\\x80-\\xFF])))))',
            self::TOKEN_UNION => '\\|',
            self::TOKEN_INTERSECTION => '&',
            self::TOKEN_NULLABLE => '\\?',
            self::TOKEN_NEGATED => '!',

            self::TOKEN_OPEN_PARENTHESES => '\\(',
            self::TOKEN_CLOSE_PARENTHESES => '\\)',
            self::TOKEN_OPEN_ANGLE_BRACKET => '<',
            self::TOKEN_CLOSE_ANGLE_BRACKET => '>',
            self::TOKEN_OPEN_SQUARE_BRACKET => '\\[',
            self::TOKEN_CLOSE_SQUARE_BRACKET => '\\]',
            self::TOKEN_OPEN_CURLY_BRACKET => '\\{',
            self::TOKEN_CLOSE_CURLY_BRACKET => '\\}',

            self::TOKEN_COMMA => ',',
            self::TOKEN_COMMENT => '\/\/[^\\r\\n]*(?=\n|\r|\*/)',
            self::TOKEN_VARIADIC => '\\.\\.\\.',
            self::TOKEN_DOUBLE_COLON => '::',
            self::TOKEN_DOUBLE_ARROW => '=>',
            self::TOKEN_ARROW => '->',
            self::TOKEN_EQUAL => '=',
            self::TOKEN_COLON => ':',

            self::TOKEN_OPEN_PHPDOC => '/\\*\\*(?=\\s)\\x20?+',
            self::TOKEN_CLOSE_PHPDOC => '\\*/',
            self::TOKEN_PHPDOC_TAG => '@(?:[a-z][a-z0-9-\\\\]+:)?[a-z][a-z0-9-\\\\]*+',
            self::TOKEN_DOCTRINE_TAG => '@[a-z_\\\\][a-z0-9_\:\\\\]*[a-z_][a-z0-9_]*',
            self::TOKEN_PHPDOC_EOL => '\\r?+\\n[\\x09\\x20]*+(?:\\*(?!/)\\x20?+)?',

            self::TOKEN_FLOAT => '[+\-]?(?:(?:[0-9]++(_[0-9]++)*\\.[0-9]*+(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]*+(_[0-9]++)*\\.[0-9]++(_[0-9]++)*(?:e[+\-]?[0-9]++(_[0-9]++)*)?)|(?:[0-9]++(_[0-9]++)*e[+\-]?[0-9]++(_[0-9]++)*))',
            self::TOKEN_INTEGER => '[+\-]?(?:(?:0b[0-1]++(_[0-1]++)*)|(?:0o[0-7]++(_[0-7]++)*)|(?:0x[0-9a-f]++(_[0-9a-f]++)*)|(?:[0-9]++(_[0-9]++)*))',
            self::TOKEN_SINGLE_QUOTED_STRING => '\'(?:\\\\[^\\r\\n]|[^\'\\r\\n\\\\])*+\'',
            self::TOKEN_DOUBLE_QUOTED_STRING => '"(?:\\\\[^\\r\\n]|[^"\\r\\n\\\\])*+"',
            self::TOKEN_DOCTRINE_ANNOTATION_STRING => '"(?:""|[^"])*+"',

            self::TOKEN_WILDCARD => '\\*',

            // a run of non-whitespace characters that stops before TOKEN_CLOSE_PHPDOC
            self::TOKEN_OTHER => '(?:(?!\\*/)[^\\s])++',
        ];

        // Build the alternatives by value; iterating by reference would leave a
        // dangling reference to the last array element after the loop.
        $alternatives = [];
        foreach ($patterns as $type => $pattern) {
            $alternatives[] = '(?:' . $pattern . ')(*MARK:' . $type . ')';
        }

        return '~' . implode('|', $alternatives) . '~Asi';
    }

}
198: