#include "ugly_macros.h"
// Example: dispatch on a keyword string by hashing it at run time and
// comparing against compile-time hashes of known keywords.  HashStr() and
// the ID() macro (defined later in this file) compute the same FNV-style
// hash, so each case label is a constant matching the run-time value.
// NOTE(review): `token` is assumed to come from the surrounding lexer code
// not shown here — this is an illustrative fragment, not a complete unit.
switch (HashStr(token))
{
case ID('f','o','r'):
break;
case ID('e','l','s','e'):
break;
case ID('e','n','u','m'):
break;
default:
// not one of the known keywords
break;
}
/* helper, truncate to 20 args */
/* Expands to exactly its first 20 arguments, discarding any extras. */
#define _GET_20_ARGS( \
_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, \
... ) \
_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19
/* results in up to 20 arguments, padded with null characters
* requires at least 1 argument */
/* Appends 19 '\0' fillers so _GET_20_ARGS always receives at least 20
 * arguments; the net result is the caller's 1..20 characters padded with
 * '\0' out to exactly 20. */
#define GET_20_ARGS(...) \
_GET_20_ARGS( __VA_ARGS__, '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', \
'\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' )
/* user interface */
/* ID('f','o','r') -> compile-time hash of the '\0'-padded character list. */
#define ID(...) CompileStrHasher<GET_20_ARGS(__VA_ARGS__)>::value
/* hash function (FNV in reverse) */
/* Compile-time recursive hasher over 20 chars, processed back to front:
 * combines _0 with the hash of the remaining 19 characters (re-padded to
 * 20 with '\0' by the ID() macro) and multiplies by the FNV prime
 * 16777619.  Recursion terminates at the all-'\0' specialization below,
 * which supplies the seed. */
template< char _0, char _1, char _2, char _3, char _4, char _5, char _6, char _7, char _8, char _9,
char _10, char _11, char _12, char _13, char _14, char _15, char _16, char _17, char _18, char _19 >
struct CompileStrHasher
{
enum
{
// ID(...) expands to CompileStrHasher<_1.._19,'\0'>::value (one char shorter).
value = (_0 ^ ID(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19)) * 16777619u
};
};
/* seed value */
/* Base case: all 20 characters are '\0'.  Yields the 32-bit FNV offset
 * basis (2166136261), terminating the template recursion above. */
template<>
struct CompileStrHasher<GET_20_ARGS('\0')>
{
enum
{
value = 2166136261u
};
};
// example usage
#if __cplusplus < 201103L
// flat solution (run-time execution)
// Run-time FNV-style hash, folded from the last character back to the
// first, matching the compile-time ID() hasher: at most the first 20
// characters contribute.
// @param str NUL-terminated string (must not be null)
// @return 2166136261u (FNV offset basis) for "", otherwise the folded hash
unsigned HashStr(const char* str)
{
unsigned value = 2166136261u;
if (!*str)
return value; // empty string: seed value only
// Advance `end` to the last participating character.  Cap the scan at
// i < 19 so `end` never passes str+19, i.e. at most 20 characters
// (str[0..19]) are hashed — consistent with the constexpr variant
// (depth < 20) and the unrolled switch variant (case 20).  The previous
// bound (i < 20) hashed 21 characters for long strings.
const char* end = str;
for (int i = 0; *(end + 1) && i < 19; ++end, ++i)
;
// Fold characters from `end` back down to `str` (FNV in reverse).
do
{
value = (*end ^ value) * 16777619u;
} while (str != (end--));
return value;
}
#else
// recursive C++11 constexpr (compile-time execution)
// Folds at most the first 20 characters, back to front, starting from the
// FNV offset basis 2166136261 and multiplying by the FNV prime 16777619.
// Single-return form keeps it valid as a C++11 constexpr function.
constexpr unsigned HashStr(const char* str, int depth = 0)
{
    return (depth >= 20 || *str == '\0')
               ? 2166136261u
               : (*str ^ HashStr(str + 1, depth + 1)) * 16777619u;
}
#endif
// Demonstration: the run-time hash of "str" must equal its compile-time ID.
// Returns 0 when the two hashers agree, 1 otherwise.
int main()
{
    const unsigned h = HashStr("str");
    if (h == ID('s','t','r'))
        return 0; // run-time and compile-time hashes agree
    return 1;     // mismatch between the two hashers
}
#include <cstring>
/* Run-time FNV-style hash folded from the last participating character back
 * to the first.  Behaves exactly like the unrolled switch version: only the
 * first 20 characters of `str` contribute (longer strings are truncated),
 * and the empty string yields the seed 2166136261u. */
unsigned HashStr(const char* str)
{
    size_t count = strlen(str);
    if (count > 20)
        count = 20; /* longer strings hash only their first 20 chars */
    unsigned value = 2166136261u;
    while (count > 0)
    {
        --count;
        value = (str[count] ^ value) * 16777619u;
    }
    return value;
}
> Then how would you handle hash collisions?

If the performance gain from this conversion is significant, and there are no collisions with the list of built-in keywords (verified by testing on a large selection of code, e.g. STL, Boost, wxWidgets, C::B, the Linux kernel), I would argue we should not handle it. Why build purposefully wrong code? Because the hiccups it might cause would be rare in comparison to the rest of the parser's shortcomings (but the rare risk is only worth taking if the speed-up is significant).
> [...] I have C preprocessor grammars for both Quex - A Fast Universal Lexical Analyzer Generator (http://quex.sourceforge.net/) and the re2c scanner generator (http://sourceforge.net/projects/re2c/); they can generate very fast lexers, which not only extract a lexeme string (like what we currently do in Tokenizer::GetToken()), but also an ID value. So, finally, we could compare all the keyword tokens by IDs, not by literal strings. But using them would change our CC's code a lot :(

This would be ideal, but a significant job to rewrite. I personally like Ragel (http://www.complang.org/ragel/), since mixing it in code feels cleaner/more flexible (and it says it has full DFA minimization, whereas re2c states it cannot yet do that; I do not have much experience with Quex). Do you have a link to your C preprocessor grammars? I am curious what they look like.
Hi, Alpha.

> [...] I have C preprocessor grammars for both Quex - A Fast Universal Lexical Analyzer Generator (http://quex.sourceforge.net/) and the re2c scanner generator (http://sourceforge.net/projects/re2c/); they can generate very fast lexers, which not only extract a lexeme string (like what we currently do in Tokenizer::GetToken()), but also an ID value. So, finally, we could compare all the keyword tokens by IDs, not by literal strings. But using them would change our CC's code a lot :(

This would be ideal, but a significant job to rewrite. I personally like Ragel (http://www.complang.org/ragel/), since mixing it in code feels cleaner/more flexible (and it says it has full DFA minimization, whereas re2c states it cannot yet do that; I do not have much experience with Quex). Do you have a link to your C preprocessor grammars? I am curious what they look like.
This release adds a very important step in the process of code generation: minimization of the underlying DFA (deterministic finite automaton). Simply speaking, this means that re2c now generates less code (while the generated code behaves in exactly the same way).
DFA minimization is a very well-known technique, and one might expect that any self-respecting lexer generator would definitely use it. So how could it be that re2c didn't? In fact, re2c did use a couple of self-invented tricks to compress the generated code (one interesting technique is constructing a tunnel automaton). Some of these tricks were quite buggy (see this bug report for an example). Now that re2c does canonical DFA minimization, all this stuff is obsolete and has been dropped.
...