//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (this is what distinguishes our representation from the one
// accepted by StringRef::getAsInteger). If Representation is correct, 'true'
// is returned and the parsed value is stored in Num.
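// A few illustrative examples (not an exhaustive list): "7", "042", "0x2aL"
// and "4294967295" are accepted, while "12Q", a lone "0x" or anything that
// does not fit into 32 bits (e.g. "4294967296") is rejected.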
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
  return Result;
}

bool RCToken::isLongInt() const {
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return a boolean value; if it is false, the
  // stream has ended or the advance failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules to determine what is a
  // correct identifier. We assume they should follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
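  // (Illustrative note: the checks below also accept '.', '/' and '\', so
  // e.g. "IDC_BUTTON1", "DIALOGEX" or "icons\app.ico" each lex as a single
  // identifier token.)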
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a digit 0-9. It can contain the
  // characters 0-9A-Fa-f (digits), Ll (marking that the integer is 32-bit)
  // and Xx (marking that the representation is hexadecimal). As some kind
  // of separator should come after the integer, we consume characters until
  // the first non-alphanumeric one.
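  // For instance (illustrative only), in "0x2aL)" the integer token ends
  // right before ')', while in "123BAD" all six characters are consumed as
  // one Int token and rejected later by rcGetAsInteger.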
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if tokenizer can start reading a single-line comment (e.g. a
  // comment that begins with '//').
  bool canStartLineComment() const;

  // Check if tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/').
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

void Tokenizer::skipCurrentLine() {
  Pos = Data.find_first_of("\r\n", Pos);
  Pos = Data.find_first_not_of("\r\n", Pos);

  if (Pos == StringRef::npos)
    Pos = DataLength;
}

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    // Comments are just discarded; don't bother saving them.
    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
      continue;

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has an incorrect format or cannot be represented as
        // a 32-bit unsigned integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.def"
    advance();
    return Error::success();

  case Kind::LineComment:
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }
  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string
        // hasn't ended and the '"' we just consumed is part of the string
        // (a doubled quote escapes a literal quote character).
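        // For example (illustrative only), the input "He said ""hi"""
        // lexes as a single String token whose raw text, doubled quotes
        // included, is: "He said ""hi""".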
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
         CurChar == '/' || CurChar == '\\';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canStartBlockComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("/*");
}

bool Tokenizer::canStartLineComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("//");
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // At this stage, BEGIN and END are still recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) \
  case Ch:                    \
    return Kind::Name;
#include "ResourceScriptTokenList.def"

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

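// Illustrative use of the entry point below (a hypothetical caller, not part
// of this library):
//
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC("1 BITMAP \"a.bmp\"");
//   if (!Tokens)
//     return Tokens.takeError();
//   // Tokens now holds: Int("1"), Identifier("BITMAP"), String("\"a.bmp\"").
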
Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm