From:	CSBVAX::MRGATE!schmidt@bonnie.ics.uci.edu@SMTP  9-AUG-1988 23:14
To:	ARISIA::EVERHART
Subj:	Minimum perfect hash function for gcc 1.25 reserved words.


Received: from rome.ics.uci.edu by prep.ai.mit.edu; Mon, 8 Aug 88 14:40:17 EST
Received: from ics.uci.edu by ROME.ICS.UCI.EDU id aa19109; 8 Aug 88 11:38 PDT
Received: by ICS.UCI.EDU id ao24043; 8 Aug 88 11:37 PDT
Received: from bonnie.ics.uci.edu by ICS.UCI.EDU id aa06567; 6 Aug 88 0:09 PDT
To: info-gcc <@beanie.ICS.UCI.EDU:info-gcc@prep.ai.mit.edu>
Subject: Minimum perfect hash function for gcc 1.25 reserved words.
Date: Sat, 06 Aug 88 00:04:18 -0700
From: "Douglas C. Schmidt" <schmidt@bonnie.ics.uci.edu>
Message-Id:  <8808060009.aa06567@ICS.UCI.EDU>


Howdy,

While preparing for a data structures course I'm currently teaching at
University of California, Irvine, I happened upon a program that
generates minimum perfect hash functions for sets of reserved words.
The code below contains diffs for the parse.y file.  This code
implements a minimum perfect hash function for the 36 gcc reserved
words.  For those of a less theoretical bent, a minimum perfect hash
function is simply: 

   1. a hash function and and a data structure that allows
      recognition of a key word in a set using 1 probe into the data
      structure.  This is the "perfect" part.

   2. the data structure holding the reserved words must be precisely
      large enough for the set of words, and no larger.  This is the
      "minimum" part (which I've relaxed just a tad; purists will note
      that this relaxation is unnecessary if an offset is subtracted from
      the hash value).

Some informal empirical tests have shown that this implementation is
somewhat faster overall than the "original" gcc method (which uses a
hash scheme based on the length of the identifer), especially when the
files are large and the number of identifiers with 5 and 6 character
lengths are high (this is the worst case scenario for the current gcc
scheme, since up to 10 comparison may be performed for 6 character
identifiers).  Of course, for many input files the difference is
insignificant, since the running time of the current method is highly
dependent on the input characteristics.  Regarding the latter point,
the biggest practical and theoretical win with the minimum perfect
hash function method is the fact that there is simply no "worst case"
*identifier* input that causes the lexical analyzer to perform any
worse than "usual."

I've produced two versions of the minimum perfect hash functions: the
current one, which is easier to understand and maintain, and another
version, which is less coherent, but smaller and faster.  I'll post
the latter version in a subsequent message.  I'd be interested in
hearing from anyone who installs and tests this out (any improvements
would be welcome as well!).  Don't expect miracles, since reserved
word recognition is certainly not the primary determinant of compiler
execution speed.  As I said before, the main advantage is the
consistent lexical analysis performance and the elimination of worst
case inputs.

Finally, for anyone who is interested, I'll be happy to send out
copies of the minimum hash function *generator* program as well.  
Those of you working on compilers or any program that requires a
key word lookup routine may find this *extremely* useful for speeding
up important functions.  My program derived the minimum perfect hash
function for gcc reserved words in about 3 seconds!

Hope you enjoy this!

Doug Schmidt

------------------------------( Cut Here )------------------------------

*** parse.y.old	Fri Aug  5 23:26:21 1988
--- parse.y	Fri Aug  5 23:17:09 1988
***************
*** 1219,1281 ****
  static char *token_buffer;	/* Pointer to token buffer.
  				   Actual allocated length is maxtoken + 2.  */
  
! #define MAXRESERVED 9
  
- /* frw[I] is index in `reswords' of the first word whose length is I;
-    frw[I+1] is one plus the index of the last word whose length is I.
-    The length of frw must be MAXRESERVED + 2 so there is an element
-    at MAXRESERVED+1 for the case I == MAXRESERVED.  */
- 
- static char frw[MAXRESERVED+2] =
-   { 0, 0, 0, 2, 5, 13, 19, 29, 31, 35, 36 };
- 
- /* Table of reserved words.  */
- 
- struct resword { char *name; short token; enum rid rid;};
- 
  #define NORID RID_UNUSED
  
! static struct resword reswords[]
!   = {{"if", IF, NORID},
!      {"do", DO, NORID},
!      {"int", TYPESPEC, RID_INT},
!      {"for", FOR, NORID},
!      {"asm", ASM, NORID},
!      {"case", CASE, NORID},
!      {"char", TYPESPEC, RID_CHAR},
!      {"auto", SCSPEC, RID_AUTO},
!      {"goto", GOTO, NORID},
       {"else", ELSE, NORID},
-      {"long", TYPESPEC, RID_LONG},
-      {"void", TYPESPEC, RID_VOID},
       {"enum", ENUM, NORID},
-      {"float", TYPESPEC, RID_FLOAT},
-      {"short", TYPESPEC, RID_SHORT},
-      {"union", UNION, NORID},
-      {"break", BREAK, NORID},
       {"while", WHILE, NORID},
-      {"const", TYPE_QUAL, RID_CONST},
-      {"double", TYPESPEC, RID_DOUBLE},
-      {"static", SCSPEC, RID_STATIC},
       {"extern", SCSPEC, RID_EXTERN},
       {"struct", STRUCT, NORID},
       {"return", RETURN, NORID},
!      {"sizeof", SIZEOF, NORID},
       {"typeof", TYPEOF, NORID},
       {"switch", SWITCH, NORID},
!      {"signed", TYPESPEC, RID_SIGNED},
!      {"inline", SCSPEC, RID_INLINE},
!      {"typedef", SCSPEC, RID_TYPEDEF},
!      {"default", DEFAULT, NORID},
!      {"unsigned", TYPESPEC, RID_UNSIGNED},
       {"continue", CONTINUE, NORID},
!      {"register", SCSPEC, RID_REGISTER},
!      {"volatile", TYPE_QUAL, RID_VOLATILE},
!      {"__alignof", ALIGNOF, NORID}};
  
  /* The elements of `ridpointers' are identifier nodes
!    for the reserved type names and storage classes.
!    It is indexed by a RID_... value.  */
  
  tree ridpointers[(int) RID_MAX];
  
--- 1219,1343 ----
  static char *token_buffer;	/* Pointer to token buffer.
  				   Actual allocated length is maxtoken + 2.  */
  
! #define MIN_WORD_SIZE       2      /* minimum size for C reserved words */
! #define MAX_WORD_SIZE       9      /* maximum size for C reserved words */
! #define MIN_KEY_SIZE        4      /* range of the hash keys for the */
! #define MAX_KEY_SIZE        39     /* minimum perfect hash generator */
  
  #define NORID RID_UNUSED
  
! struct resword {char *name; short token; enum rid rid;};
! 
! static struct resword reswords[] 
!   = {{NULL, 0, NORID},            
!      {NULL, 0, NORID}, /* these locations are not used. */
!      {NULL, 0, NORID}, /* they simplify the hashing.    */
!      {NULL, 0, NORID},
       {"else", ELSE, NORID},
       {"enum", ENUM, NORID},
       {"while", WHILE, NORID},
       {"extern", SCSPEC, RID_EXTERN},
+      {"double", TYPESPEC, RID_DOUBLE},
+      {"default", DEFAULT, NORID},
+      {"do", DO, NORID},
+      {"goto", GOTO, NORID},
+      {"short", TYPESPEC, RID_SHORT},        
       {"struct", STRUCT, NORID},
       {"return", RETURN, NORID},
!      {"signed", TYPESPEC, RID_SIGNED},       
!      {"float", TYPESPEC, RID_FLOAT},        
       {"typeof", TYPEOF, NORID},
+      {"typedef", SCSPEC, RID_TYPEDEF},        
       {"switch", SWITCH, NORID},
!      {"int", TYPESPEC, RID_INT},          
!      {"for", FOR, NORID},
!      {"register", SCSPEC, RID_REGISTER},       
!      {"inline", SCSPEC, RID_INLINE},         
!      {"sizeof", SIZEOF, NORID},
!      {"void", TYPESPEC, RID_VOID},         
!      {"__alignof", ALIGNOF, NORID},
!      {"volatile", TYPE_QUAL, RID_VOLATILE},    
!      {"case", CASE, NORID},
!      {"const", TYPE_QUAL, RID_CONST},       
!      {"if", IF, NORID},
!      {"long", TYPESPEC, RID_LONG},         
       {"continue", CONTINUE, NORID},
!      {"asm", ASM, NORID},
!      {"union", UNION, NORID},
!      {"char", TYPESPEC, RID_CHAR},         
!      {"break", BREAK, NORID},               
!      {"static", SCSPEC, RID_STATIC},         
!      {"unsigned", TYPESPEC, RID_UNSIGNED},     
!      {"auto", SCSPEC, RID_AUTO}};
  
+ /* This function performs the minimum-perfect hash mapping from input
+    string to reswords table index.  It only looks at the first and 
+    last characters in the string, thus assuring the O(1) lookup time 
+    (this keeps our constant down to an insignificant amount!).  Compiling
+    the following 2 functions as inline removes all overhead of the
+    function calls. */
+ 
+ #ifdef __GNUC__
+ inline static int hash (char *str,int len)
+ #else
+ static int hash (str,len)
+ register char *str;             
+ register int len;
+ #endif
+ {
+   /* This table is used to build the hash table index that recognizes
+      reserved words in 0(1) steps.  It is larger than strictly necessary, 
+      but I'm trading off the space for the time-saving luxury of avoiding 
+      subtraction of an offset. */
+ 
+   static char hash_table[]
+     = {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+        0,  0,  0,  0,  0,  6,  0, 29, 31, 24,
+        2,  0, 11,  1,  6, 17,  0,  0, 26,  1,
+        1,  6,  0,  0,  7,  7,  0, 28, 19,  1,
+        0,  0,  0,  0,  0,  0,  0,  0};
+ 
+   /* The hash function couldn't be simpler: add the length of the string
+      to the Hash_Table value of its first and last character. */
+ 
+   return len + hash_table[ (int) str[0]] + hash_table[ (int) str[len - 1]];
+ }
+ 
+ /* This routine attempts to match the string found in the Word_Table with
+    the one from the input stream.  If all the relevant details match an
+    actual strcmp comparison is performed.  */
+ 
+ #ifdef __GNUC__
+ inline struct resword *is_reserved_word (char *str,int len)
+ #else
+ struct resword *is_reserved_word (str,len)
+ register char *str;
+ register int len;
+ #endif
+ {
+   if (len <= MAX_WORD_SIZE && len >= MIN_WORD_SIZE) 
+     {
+       register int key = hash (str,len);
+ 
+       if (key >= MIN_KEY_SIZE && key <= MAX_KEY_SIZE)
+ 	if (reswords[key].name[0] == str[0]
+ 	    && !strcmp (reswords[key].name + 1,str + 1))
+ 	  return &reswords[key];
+     }
+   return NULL;   
+ }
+ 
  /* The elements of `ridpointers' are identifier nodes
!  for the reserved type names and storage classes.
!  It is indexed by a RID_... value.  */
  
  tree ridpointers[(int) RID_MAX];
  
***************
*** 1810,1844 ****
        value = IDENTIFIER;
        yylval.itype = 0;
  
!       /* Try to recognize a keyword.  */
  
!       if (p - token_buffer <= MAXRESERVED)
! 	{
! 	  register int lim = frw [p - token_buffer + 1];
! 	  register int i = frw[p - token_buffer];
! 	  register struct resword *p = &reswords[i];
! 
! 	  for (; i < lim; i++, p++)
! 	    if (p->name[0] == token_buffer[0]
! 		&& !strcmp (p->name, token_buffer))
! 	      {
! 		if (p->rid)
! 		  yylval.ttype = ridpointers[(int) p->rid];
! 		if ((! flag_no_asm
! 		     || ((int) p->token != ASM
! 			 && (int) p->token != TYPEOF
! 			 && strcmp (p->name, "inline")))
! 		    /* -ftraditional means don't recognize
! 		       typeof, const, volatile, signed or inline.  */
! 		    && (! flag_traditional
! 			|| ((int) p->token != TYPE_QUAL
! 			    && (int) p->token != TYPEOF
! 			    && strcmp (p->name, "signed")
! 			    && strcmp (p->name, "inline"))))
! 		  value = (int) p->token;
! 		break;
! 	      }
! 	}
  
        /* If we did not find a keyword, look for an identifier
  	 (or a typename).  */
--- 1872,1900 ----
        value = IDENTIFIER;
        yylval.itype = 0;
  
!       /* Try to recognize a keyword.  Uses minimum-perfect hash function */
!   
!       {
! 	register struct resword *ptr;
  
! 	if (ptr = is_reserved_word (token_buffer,p - token_buffer))
! 	  {
! 	    if (ptr->rid)
! 	      yylval.ttype = ridpointers[(int) ptr->rid];
! 	    if ((! flag_no_asm
! 		 || ((int) ptr->token != ASM
! 		     && (int) ptr->token != TYPEOF
! 		     && ptr->rid != RID_INLINE)) /* why waste a strcmp? */
! 		/* -ftraditional means don't recognize
! 		   typeof, const, volatile, signed or inline.  */
! 		&& (! flag_traditional
! 		    || ((int) ptr->token != TYPE_QUAL
! 			&& (int) ptr->token != TYPEOF
! 			&& ptr->rid != RID_SIGNED /* again, no need for */
! 			&& ptr->rid != RID_INLINE))) /* a function call.   */
! 	      value = (int) ptr->token;
! 	  }
!       }
  
        /* If we did not find a keyword, look for an identifier
  	 (or a typename).  */