blob: e9980b36b97bb0bfd29a643a6b9289bcbb4aff1f [file] [log] [blame]
//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This class represents the Lexer for tablegen files.
//
//===----------------------------------------------------------------------===//
13
Benjamin Kramer00e08fc2014-08-13 16:26:38 +000014#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
15#define LLVM_LIB_TABLEGEN_TGLEXER_H
Chris Lattnera8058742007-11-18 02:57:27 +000016
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
Chris Lattnera8058742007-11-18 02:57:27 +000026
27namespace llvm {
Chris Lattner8070ea32009-06-21 03:41:50 +000028class SourceMgr;
Chris Lattner1e3a8a42009-06-21 03:39:35 +000029class SMLoc;
Benjamin Kramerd1e17032010-09-27 17:42:11 +000030class Twine;
31
/// Token kinds produced (or used internally) by the TableGen lexer.
namespace tgtok {
  /// Kind of a lexed token. The enumerators are grouped by the amount and
  /// type of extra information that accompanies the token.
  enum TokKind {
    // Markers
    Eof, Error,

    // Tokens with no info.
    minus, plus,        // - +
    l_square, r_square, // [ ]
    l_brace, r_brace,   // { }
    l_paren, r_paren,   // ( )
    less, greater,      // < >
    colon, semi,        // : ;
    comma, period,      // , .
    equal, question,    // = ?
    paste,              // #

    // Keywords.
    Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List,
    MultiClass, String, Defset,

    // !keywords.
    XConcat, XADD, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast,
    XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, XIf, XEq, XIsA, XDag,
    XNe, XLe, XLt, XGe, XGt,

    // Integer value.
    IntVal,

    // Binary constant.  Note that these are sized according to the number of
    // bits given.
    BinaryIntVal,

    // String valued tokens.
    Id, StrVal, VarName, CodeFragment,

    // Preprocessing tokens for internal usage by the lexer.
    // They are never returned as a result of Lex().
    Ifdef, Else, Endif, Define
  };
} // end namespace tgtok
Chris Lattnera8058742007-11-18 02:57:27 +000072
Chris Lattnerf4601652007-11-22 20:49:04 +000073/// TGLexer - TableGen Lexer class.
Chris Lattnera8058742007-11-18 02:57:27 +000074class TGLexer {
Chris Lattner8070ea32009-06-21 03:41:50 +000075 SourceMgr &SrcMgr;
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +000076
Chris Lattnera8058742007-11-18 02:57:27 +000077 const char *CurPtr;
Rafael Espindola245fbdf2014-07-06 14:24:03 +000078 StringRef CurBuf;
Chris Lattnera8058742007-11-18 02:57:27 +000079
Chris Lattnerf4601652007-11-22 20:49:04 +000080 // Information about the current token.
81 const char *TokStart;
82 tgtok::TokKind CurCode;
83 std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
Dan Gohman63f97202008-10-17 01:33:43 +000084 int64_t CurIntVal; // This is valid for INTVAL.
Chris Lattneraa739d22009-03-13 07:05:43 +000085
86 /// CurBuffer - This is the current buffer index we're lexing from as managed
87 /// by the SourceMgr object.
Alp Toker1508c822014-07-06 10:33:31 +000088 unsigned CurBuffer;
Sean Silvaa170f522013-02-07 04:30:39 +000089
90public:
91 typedef std::map<std::string, SMLoc> DependenciesMapTy;
92private:
Joerg Sonnenbergerdd137902011-06-01 13:10:15 +000093 /// Dependencies - This is the list of all included files.
Sean Silvaa170f522013-02-07 04:30:39 +000094 DependenciesMapTy Dependencies;
95
Chris Lattnera8058742007-11-18 02:57:27 +000096public:
Vyacheslav Zakharin6c99d2b2018-11-27 18:57:43 +000097 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
Benjamin Kramer829e0132015-04-11 15:32:26 +000098
Chris Lattnerf4601652007-11-22 20:49:04 +000099 tgtok::TokKind Lex() {
Vyacheslav Zakharin6c99d2b2018-11-27 18:57:43 +0000100 return CurCode = LexToken(CurPtr == CurBuf.begin());
Chris Lattnerf4601652007-11-22 20:49:04 +0000101 }
Joerg Sonnenbergerdd137902011-06-01 13:10:15 +0000102
Sean Silvaa170f522013-02-07 04:30:39 +0000103 const DependenciesMapTy &getDependencies() const {
Joerg Sonnenbergerdd137902011-06-01 13:10:15 +0000104 return Dependencies;
105 }
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +0000106
Chris Lattnerf4601652007-11-22 20:49:04 +0000107 tgtok::TokKind getCode() const { return CurCode; }
108
109 const std::string &getCurStrVal() const {
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +0000110 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
Chris Lattnerf4601652007-11-22 20:49:04 +0000111 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
112 "This token doesn't have a string value");
113 return CurStrVal;
114 }
Dan Gohman63f97202008-10-17 01:33:43 +0000115 int64_t getCurIntVal() const {
Chris Lattnerf4601652007-11-22 20:49:04 +0000116 assert(CurCode == tgtok::IntVal && "This token isn't an integer");
117 return CurIntVal;
118 }
Pete Cooper42c12272014-08-07 05:47:00 +0000119 std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
120 assert(CurCode == tgtok::BinaryIntVal &&
121 "This token isn't a binary integer");
122 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2);
123 }
Chris Lattnerf4601652007-11-22 20:49:04 +0000124
Chris Lattner1e3a8a42009-06-21 03:39:35 +0000125 SMLoc getLoc() const;
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +0000126
Chris Lattnera8058742007-11-18 02:57:27 +0000127private:
Chris Lattnerf4601652007-11-22 20:49:04 +0000128 /// LexToken - Read the next token and return its code.
Vyacheslav Zakharin6c99d2b2018-11-27 18:57:43 +0000129 tgtok::TokKind LexToken(bool FileOrLineStart = false);
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +0000130
Vyacheslav Zakharin6c99d2b2018-11-27 18:57:43 +0000131 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
Benjamin Kramerd1e17032010-09-27 17:42:11 +0000132 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +0000133
Chris Lattnera8058742007-11-18 02:57:27 +0000134 int getNextChar();
Vyacheslav Zakharin6c99d2b2018-11-27 18:57:43 +0000135 int peekNextChar(int Index) const;
Chris Lattnera8058742007-11-18 02:57:27 +0000136 void SkipBCPLComment();
137 bool SkipCComment();
Chris Lattnerf4601652007-11-22 20:49:04 +0000138 tgtok::TokKind LexIdentifier();
Chris Lattnera8058742007-11-18 02:57:27 +0000139 bool LexInclude();
Chris Lattnerf4601652007-11-22 20:49:04 +0000140 tgtok::TokKind LexString();
141 tgtok::TokKind LexVarName();
142 tgtok::TokKind LexNumber();
143 tgtok::TokKind LexBracket();
144 tgtok::TokKind LexExclaim();
Vyacheslav Zakharin6c99d2b2018-11-27 18:57:43 +0000145
146 // Process EOF encountered in LexToken().
147 // If EOF is met in an include file, then the method will update
148 // CurPtr, CurBuf and preprocessing include stack, and return true.
149 // If EOF is met in the top-level file, then the method will
150 // update and check the preprocessing include stack, and return false.
151 bool processEOF();
152
153 // *** Structures and methods for preprocessing support ***
154
155 // A set of macro names that are defined either via command line or
156 // by using:
157 // #define NAME
158 StringSet<> DefinedMacros;
159
160 // Each of #ifdef and #else directives has a descriptor associated
161 // with it.
162 //
163 // An ordered list of preprocessing controls defined by #ifdef/#else
164 // directives that are in effect currently is called preprocessing
165 // control stack. It is represented as a vector of PreprocessorControlDesc's.
166 //
167 // The control stack is updated according to the following rules:
168 //
169 // For each #ifdef we add an element to the control stack.
170 // For each #else we replace the top element with a descriptor
171 // with an inverted IsDefined value.
172 // For each #endif we pop the top element from the control stack.
173 //
174 // When CurPtr reaches the current buffer's end, the control stack
175 // must be empty, i.e. #ifdef and the corresponding #endif
176 // must be located in the same file.
177 struct PreprocessorControlDesc {
178 // Either tgtok::Ifdef or tgtok::Else.
179 tgtok::TokKind Kind;
180
181 // True, if the condition for this directive is true, false - otherwise.
182 // Examples:
183 // #ifdef NAME : true, if NAME is defined, false - otherwise.
184 // ...
185 // #else : false, if NAME is defined, true - otherwise.
186 bool IsDefined;
187
188 // Pointer into CurBuf to the beginning of the preprocessing directive
189 // word, e.g.:
190 // #ifdef NAME
191 // ^ - SrcPos
192 SMLoc SrcPos;
193 };
194
195 // We want to disallow code like this:
196 // file1.td:
197 // #define NAME
198 // #ifdef NAME
199 // include "file2.td"
200 // EOF
201 // file2.td:
202 // #endif
203 // EOF
204 //
205 // To do this, we clear the preprocessing control stack on entry
206 // to each of the included file. PrepIncludeStack is used to store
207 // preprocessing control stacks for the current file and all its
208 // parent files. The back() element is the preprocessing control
209 // stack for the current file.
210 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
211 PrepIncludeStack;
212
213 // Validate that the current preprocessing control stack is empty,
214 // since we are about to exit a file, and pop the include stack.
215 //
216 // If IncludeStackMustBeEmpty is true, the include stack must be empty
217 // after the popping, otherwise, the include stack must not be empty
218 // after the popping. Basically, the include stack must be empty
219 // only if we exit the "top-level" file (i.e. finish lexing).
220 //
221 // The method returns false, if the current preprocessing control stack
222 // is not empty (e.g. there is an unterminated #ifdef/#else),
223 // true - otherwise.
224 bool prepExitInclude(bool IncludeStackMustBeEmpty);
225
226 // Look ahead for a preprocessing directive starting from CurPtr. The caller
227 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
228 // a preprocessing directive word followed by a whitespace, then it returns
229 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
230 //
231 // CurPtr is not adjusted by this method.
232 tgtok::TokKind prepIsDirective() const;
233
234 // Given a preprocessing token kind, adjusts CurPtr to the end
235 // of the preprocessing directive word. Returns true, unless
236 // an unsupported token kind is passed in.
237 //
238 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
239 // to avoid adjusting CurPtr before we are sure that '#' is followed
240 // by a preprocessing directive. If it is not, then we fall back to
241 // tgtok::paste interpretation of '#'.
242 bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
243
244 // The main "exit" point from the token parsing to preprocessor.
245 //
246 // The method is called for CurPtr, when prepIsDirective() returns
247 // true. The first parameter matches the result of prepIsDirective(),
248 // denoting the actual preprocessor directive to be processed.
249 //
250 // If the preprocessing directive disables the tokens processing, e.g.:
251 // #ifdef NAME // NAME is undefined
252 // then lexPreprocessor() enters the lines-skipping mode.
253 // In this mode, it does not parse any tokens, because the code under
254 // the #ifdef may not even be a correct tablegen code. The preprocessor
255 // looks for lines containing other preprocessing directives, which
256 // may be prepended with whitespaces and C-style comments. If the line
257 // does not contain a preprocessing directive, it is skipped completely.
258 // Otherwise, the preprocessing directive is processed by recursively
259 // calling lexPreprocessor(). The processing of the encountered
260 // preprocessing directives includes updating preprocessing control stack
261 // and adding new macros into DefinedMacros set.
262 //
263 // The second parameter controls whether lexPreprocessor() is called from
264 // LexToken() (true) or recursively from lexPreprocessor() (false).
265 //
266 // If ReturnNextLiveToken is true, the method returns the next
267 // LEX token following the current directive or following the end
268 // of the disabled preprocessing region corresponding to this directive.
269 // If ReturnNextLiveToken is false, the method returns the first parameter,
270 // unless there were errors encountered in the disabled preprocessing
271 // region - in this case, it returns tgtok::Error.
272 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
273 bool ReturnNextLiveToken = true);
274
275 // Worker method for lexPreprocessor() to skip lines after some
276 // preprocessing directive up to the buffer end or to the directive
277 // that re-enables token processing. The method returns true
278 // upon processing the next directive that re-enables tokens
279 // processing. False is returned if an error was encountered.
280 //
281 // Note that prepSkipRegion() calls lexPreprocessor() to process
282 // encountered preprocessing directives. In this case, the second
283 // parameter to lexPreprocessor() is set to false. Being passed
284 // false ReturnNextLiveToken, lexPreprocessor() must never call
285 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
286 // to prepSkipRegion() and checking that it is never set to false.
287 bool prepSkipRegion(bool MustNeverBeFalse);
288
289 // Lex name of the macro after either #ifdef or #define. We could have used
290 // LexIdentifier(), but it has special handling of "include" word, which
291 // could result in awkward diagnostic errors. Consider:
292 // ----
293 // #ifdef include
294 // class ...
295 // ----
296 // LexIdentifier() will engage LexInclude(), which will complain about
297 // missing file with name "class". Instead, prepLexMacroName() will treat
298 // "include" as a normal macro name.
299 //
300 // On entry, CurPtr points to the end of a preprocessing directive word.
301 // The method allows for whitespaces between the preprocessing directive
302 // and the macro name. The allowed whitespaces are ' ' and '\t'.
303 //
304 // If the first non-whitespace symbol after the preprocessing directive
305 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
306 // the method updates TokStart to the position of the first non-whitespace
307 // symbol, sets CurPtr to the position of the macro name's last symbol,
308 // and returns a string reference to the macro name. Otherwise,
309 // TokStart is set to the first non-whitespace symbol after the preprocessing
310 // directive, and the method returns an empty string reference.
311 //
312 // In all cases, TokStart may be used to point to the word following
313 // the preprocessing directive.
314 StringRef prepLexMacroName();
315
316 // Skip any whitespaces starting from CurPtr. The method is used
317 // only in the lines-skipping mode to find the first non-whitespace
318 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
319 // and '\r'. The method skips C-style comments as well, because
320 // it is used to find the beginning of the preprocessing directive.
321 // If we do not handle C-style comments the following code would
322 // result in incorrect detection of a preprocessing directive:
323 // /*
324 // #ifdef NAME
325 // */
326 // As long as we skip C-style comments, the following code is correctly
327 // recognized as a preprocessing directive:
328 // /* first line comment
329 // second line comment */ #ifdef NAME
330 //
331 // The method returns true upon reaching the first non-whitespace symbol
332 // or EOF, CurPtr is set to point to this symbol. The method returns false,
333 // if an error occured during skipping of a C-style comment.
334 bool prepSkipLineBegin();
335
336 // Skip any whitespaces or comments after a preprocessing directive.
337 // The method returns true upon reaching either end of the line
338 // or end of the file. If there is a multiline C-style comment
339 // after the preprocessing directive, the method skips
340 // the comment, so the final CurPtr may point to one of the next lines.
341 // The method returns false, if an error occured during skipping
342 // C- or C++-style comment, or a non-whitespace symbol appears
343 // after the preprocessing directive.
344 //
345 // The method maybe called both during lines-skipping and tokens
346 // processing. It actually verifies that only whitespaces or/and
347 // comments follow a preprocessing directive.
348 //
349 // After the execution of this mehod, CurPtr points either to new line
350 // symbol, buffer end or non-whitespace symbol following the preprocesing
351 // directive.
352 bool prepSkipDirectiveEnd();
353
354 // Skip all symbols to the end of the line/file.
355 // The method adjusts CurPtr, so that it points to either new line
356 // symbol in the current line or the buffer end.
357 void prepSkipToLineEnd();
358
359 // Return true, if the current preprocessor control stack is such that
360 // we should allow lexer to process the next token, false - otherwise.
361 //
362 // In particular, the method returns true, if all the #ifdef/#else
363 // controls on the stack have their IsDefined member set to true.
364 bool prepIsProcessingEnabled();
365
366 // Report an error, if we reach EOF with non-empty preprocessing control
367 // stack. This means there is no matching #endif for the previous
368 // #ifdef/#else.
369 void prepReportPreprocessorStackError();
Chris Lattnera8058742007-11-18 02:57:27 +0000370};
Nicolai Haehnle9ae21b32018-03-09 18:32:04 +0000371
Chris Lattnera8058742007-11-18 02:57:27 +0000372} // end namespace llvm
373
374#endif