--- ./languages/english/Makefile Thu Oct 12 12:04:08 1995 +++ ../ispell-3.1/./languages/english/Makefile Tue Jan 4 08:21:33 2000 @@ -436,8 +436,8 @@ set -x; \ PATH=$(PATHADDER):$$PATH; \ export PATH; \ - munchlist -v -l $(AFFIXES) $$dicts \ - > english.med+ \ + munchlist -l $(AFFIXES) $$dicts \ + 1> english.med+ 2> /dev/null \ || rm -f english.med+ test -s english.med+ \ || (echo 'error: zero-length dictionary generated'; \ --- ./correct.c Thu Oct 12 12:04:06 1995 +++ ../ispell-3.1/./correct.c Tue Jan 4 08:16:24 2000 @@ -50,6 +50,13 @@ /* * $Log: correct.c,v $ + * + * HTML-entities added by Casper Maarbjerg, 1997/05/16 as listed in + * http://uts.cc.utexas.edu/~churchh/latin1.html + * + * Line added by Gerry Tierney to reset insidehtml flag for each new + * file in case a tag was left open by a previous file. 10/14/95 + * * Revision 1.59 1995/08/05 23:19:43 geoff * Fix a bug that caused offsets for long lines to be confused if the * line started with a quoting uparrow. @@ -233,6 +240,9 @@ int bufsize; int ch; + /* Reset the html tag state for each new file, in case the previous file left a tag open (Gerry Tierney) */ + insidehtml = 0; + for (bufno = 0; bufno < contextsize; bufno++) contextbufs[bufno][0] = '\0'; @@ -295,7 +305,11 @@ char * start_l2; char * begintoken; +#ifdef HTSPECIAL + begintoken = ctok_start == NULL ? 
contextbufs[0] : ctok_start; +#else begintoken = *curchar - strlen (ctok); +#endif if (icharlen (itok) <= minword) return; /* Accept very short words */ @@ -374,7 +388,11 @@ if (start_l2 < contextbufs[0]) start_l2 = contextbufs[0]; } +#ifdef HTSPECIAL + show_line (start_l2, begintoken, (int) (*curchar - begintoken)); /* raw html length, cast like the #else branch */ +#else show_line (start_l2, begintoken, (int) strlen (ctok)); +#endif if (minimenusize != 0) { @@ -594,6 +612,16 @@ ichar = SET_SIZE + laststringch; else ichar = chartoichar (ch); +#ifdef HTSPECIAL + if (htmlflag == 1 && ch == '&' && !vflag && len == 1) + { + ch = html_ent(cp); + if (output) + (void) putchar (ch); + return 1; + } + else +#endif if (!vflag && iswordch (ichar) && len == 1) { if (output) @@ -1604,6 +1632,11 @@ if (**cc == '\0') break; if (!aflag && !lflag) +#ifdef HTSPECIAL + if (htmlflag == 1 && (unsigned char)**cc >= FIRST_ISO) + (void) fputs (iso_ent[(unsigned char) **cc - FIRST_ISO], outfile); /* entity text is data, never a format string */ + else +#endif (void) putc (**cc, outfile); (*cc)++; } --- ./ispell.c Thu Oct 12 12:04:07 1995 +++ ../ispell-3.1/./ispell.c Tue Jan 4 08:16:24 2000 @@ -49,6 +49,12 @@ /* * $Log: ispell.c,v $ + * + * Modifications made by Gerry Tierney to + * allow checking of html code. Adds -h switch and checking for + * html files by .html or .htm extension. + * 14th of October 1995 + * * Revision 1.133 1995/10/11 04:30:29 geoff * Get rid of an unused variable. 
* @@ -298,7 +304,9 @@ * ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789 * ^^^^ ^^^ ^ ^^ ^^ * abcdefghijklmnopqrstuvwxyz - * ^^^^^^ ^^^ ^ ^^ ^^^ + * ^^^^^^ ^ ^^^ ^ ^^ ^^^ + * + * -h flag used by Gerry Tierney for html-mode */ arglen = strlen (*argv); switch ((*argv)[1]) @@ -438,6 +446,9 @@ (void) printf ("\tNO8BIT\n"); #else /* NO8BIT */ (void) printf ("\t!NO8BIT (8BIT)\n"); +#ifdef HTSPECIAL + (void) printf ("\tHTSPECIAL \"(ISO-HTML mode)\"\n"); +#endif #endif /* NO8BIT */ (void) printf ("\tNRSPECIAL = \"%s\"\n", NRSPECIAL); (void) printf ("\tOLDPAFF = \"%s\"\n", OLDPAFF); @@ -488,6 +499,7 @@ if (arglen > 2) usage (); tflag = 0; /* nroff/troff mode */ + htmlflag = -1; /* non-html mode */ deftflag = 0; if (preftype == NULL) preftype = "nroff"; @@ -496,10 +508,19 @@ if (arglen > 2) usage (); tflag = 1; + htmlflag = -1; /* non-html mode */ deftflag = 1; if (preftype == NULL) preftype = "tex"; break; + /* -h option to enable HTML-mode added by Gerry Tierney */ + case 'h': + if (arglen > 2) + usage (); + tflag = 0; /* non-TeX mode */ + deftflag = 0; + htmlflag = 1; /* Html-Mode */ + break; case 'T': /* Set preferred file type */ p = (*argv)+2; if (*p == '\0') @@ -810,7 +831,7 @@ if (tflag < 0) tflag = (cp = rindex (filename, '.')) != NULL && strcmp (cp, ".tex") == 0; - + if (prefstringchar < 0) { defdupchar = @@ -818,6 +839,13 @@ if (defdupchar < 0) defdupchar = 0; } + /* Modification by Gerry Tierney to set html-mode + * based on file extension */ + if (htmlflag == 0) + htmlflag = + (cp = rindex (filename, '.')) != NULL && + ( strcmp (cp, ".html") == 0 || + strcmp (cp, ".htm") == 0); if ((infile = fopen (filename, "r")) == NULL) { --- ./ispell.h Thu Oct 12 12:04:08 1995 +++ ../ispell-3.1/./ispell.h Tue Jan 4 08:16:24 2000 @@ -42,6 +42,16 @@ /* * $Log: ispell.h,v $ + * + * Patch by Casper Maarbjerg, http://www.nyx.net/~cmaarbj/ + * 1997/05/19, for ISO HTML-entity conversion in html mode. + * Added variable ctok_start to hold the start of raw html word. 
+ changes wrapped in "#ifdef HTSPECIAL". + * + * Patch by Gerry Tierney + * 1995/10/14 + * Added variables htmlflag and insidehtml for use in html-mode + * * Revision 1.68 1995/03/06 02:42:41 geoff * Be vastly more paranoid about parenthesizing macro arguments. This * fixes a bug in defmt.c where a complex argument was passed to @@ -623,6 +633,26 @@ INIT (int deftflag, -1); /* NZ for TeX mode by default */ INIT (int tflag, DEFTEXFLAG); /* NZ for TeX mode in current file */ INIT (int prefstringchar, -1); /* Preferred string character type */ +/* The following two definitions added by + * Gerry Tierney + * 14th Oct 95 + */ +INIT (int htmlflag, 0); /* HTML-checking state. + * 1=enable html-mode, + * 0=enable html-mode based on filename, + * -1=disable html-mode */ +INIT (int insidehtml, 0); /* HTML state kept across lines: 1=in open tag, + * 2=in script, 3=in comment, -1=in alt text, 0=none */ +/* End of Gerry's Interference */ +#ifdef HTSPECIAL /* decode "&#;" for HTML-ISO characters */ +#ifdef NO8BIT +#error HTSPECIAL requires NO8BIT to be undefined ! +#endif +#define FIRST_ISO 160 /* First 8-bit code of valid HTML entities */ +extern char *iso_ent[]; /* HTML entities defined in defmt.c */ +extern int html_ent P ((char **in)), skip_ent P ((char **entity)); /* both defined in defmt.c; skip_ent is called before its definition */ +INIT (char *ctok_start, NULL); /* Remember start of raw HTML word */ +#endif INIT (int terse, 0); /* NZ for "terse" mode */ --- ./defmt.c Thu Oct 12 12:04:06 1995 +++ ../ispell-3.1/./defmt.c Tue Jan 4 08:16:24 2000 @@ -54,6 +54,12 @@ /* * $Log: defmt.c,v $ + * ISO-character de-/en- coding in html mode added 1997/05/16 + * by Casper Maarbjerg, http://www.nyx.net/~cmaarbj/ + * + * html-mode code added by Gerry Tierney + * 14th of Oct '95 + * * Revision 1.41 1995/08/05 23:19:47 geoff * Get rid of an obsolete comment. Add recognition of documentclass and * usepackage for Latex2e support. 
@@ -140,6 +146,7 @@ static void TeX_open_paren P ((char ** bufp)); static void TeX_skip_check P ((char ** bufp)); static int TeX_strncmp P ((char * a, char * b, int n)); +char * htmlword P ((unsigned char *source)); #define ISTEXTERM(c) (((c) == TEXLEFTCURLY) || \ ((c) == TEXRIGHTCURLY) || \ @@ -160,6 +167,25 @@ static int save_math_mode; static char save_LaTeX_Mode; +static char *skiptag(buf, tagend, taglen) /* Skip past specific tag */ + char * buf; + char * tagend; + int taglen; + { + while(*buf) + { + if (*buf != *tagend && ++buf) + continue; + if (strncasecmp(buf, tagend, taglen) && ++buf) + continue; + buf += taglen; + insidehtml = 0; + break; + } + return(buf); + } + +/* parameters changed by Gerry Tierney to include the output file */ static char * skiptoword (bufp) /* Skip to beginning of a word */ char * bufp; { @@ -170,6 +196,82 @@ || (tflag && (math_mode & 1))) ) { + /* Start of modifications by Gerry Tierney */ + /* We first check for an end-quote character if we are checking + inside of an alt attribute. If we find one we ignore the + rest of the tag */ + if (insidehtml == -1 && *bufp == '\"') + { + insidehtml = 0; + while (*bufp != '>' && *bufp != '\0') + bufp++; + if (*bufp == '\0') + insidehtml = 1; + } + /* If we are checking a html file we want to ignore any + HTML tags. These should start with a '<' + and end with a '>' so we simply skip over anything + between these two symbols. If we reach the end of the line + before finding a matching '>' we set a flag 'insidehtml' */ + if (htmlflag == 1 && *bufp == '<') + { + /* Found start of html tag, if it is a script tag, + * skip until end of script */ + if (insidehtml == 2 || (strncasecmp(bufp,"<script", 7) == 0)) + { + insidehtml = 2; + bufp = skiptag(bufp, "</script>", 9); + } + /* It could also be a comment, containing a '>', so it + * seems safer to skip until the first end-of-comment.. 
*/ + else if (insidehtml == 3 || (strncmp(bufp,"<!--", 4) == 0)) + { + insidehtml = 3; + bufp = skiptag(bufp, "-->", 3); + } + else { + /* Found start of html tag - Skip to end of tag or EOL */ + while (*bufp != '>' && *bufp != '\0' && + strncasecmp(bufp,"alt=\"",5) != 0) + bufp++; + /* If we find an alt tag, we want to check its text */ + if (strncasecmp(bufp,"alt=\"",5) == 0) + { + insidehtml=-1; + bufp = bufp + 4; + } + else if (*bufp == '\0') + /* we've reached EOL without closing the tag */ + insidehtml = 1; + } + } +#ifndef HTSPECIAL + /* HTSPECIAL characters _NOT_ defined, so... + */ + /* Skip over quoted entities such as &quot; + These all start with an ampersand and + end with a semi-colon. We do not need + to worry about them extending over more than one line */ + if (htmlflag == 1 && *bufp == '&') + { + while (*bufp && *bufp != ';' && *bufp != ' ') + bufp++; + } +#else + if (htmlflag == 1 && *bufp == '&') + { + char *cp2 = bufp; + + if (html_ent(&cp2) >= FIRST_ISO) + break; + if (!skip_ent(&bufp)) + bufp++; + continue; + } +#endif + /* End of modifications by Gerry Tierney */ + + /* check paren necessity... */ if (tflag) /* TeX or LaTeX stuff */ { @@ -329,6 +431,24 @@ lastboundary = NULL; for ( ; ; ) { +#ifdef HTSPECIAL + if (htmlflag == 1 && *bufp == '&') + { + char *cp2 = bufp; /* Avoid compiler complaints ... */ + /* .. 
about taking address of register bufp */ + if (html_ent(&cp2) < FIRST_ISO) + { + lastboundary = bufp; + bufp = cp2; + break; + } + else + { + lastboundary = NULL; + bufp = cp2; + } + } +#endif if (*bufp == '\0') { if (TeX_comment) @@ -389,7 +509,8 @@ if (hadlf) contextbufs[0][len] = 0; - if (!tflag) + /* Conditions modified by Gerry Tierney to handle html-mode */ + if (!tflag && htmlflag != 1) { /* skip over .if */ if (*currentchar == NRDOT @@ -426,7 +547,8 @@ /* if this is a formatter command, skip over it */ - if (!tflag && *currentchar == NRDOT) + /* Conditions modified by Gerry Tierney to handle html-mode */ + if (!tflag && htmlflag != 1 && *currentchar == NRDOT) { while (*currentchar && !myspace (chartoichar (*currentchar))) { @@ -441,10 +563,47 @@ return; } } + /* Start of modifications by Gerry Tierney */ + /* If we are checking an html file and we have been left with + an open tag from a previous line, then we ignore everything + from the start of the line until we either reach the end of + the line or we close the tag */ + if (htmlflag == 1) + { + if (insidehtml == 1) + while (*currentchar != '>' && *currentchar != '\0') + { + /* We check for an alt attribute (found inside img + tags). We want to spell check its text so if + we find one, we switch out html-mode until we + find the next quote character. 
We signal this + state by setting the insidehtml flag to -1 */ + if (strncasecmp(currentchar,"alt=\"",5) == 0) + { + copyout(&currentchar,5); + insidehtml = -1; + break; + } + (void) putc (*currentchar, ofile); + currentchar++; + } + else if (insidehtml == 2) /* filtering javascript; copy the skipped text to the output */ + copyout (&currentchar, skiptag(currentchar, "</script>", 9) - currentchar); + else if (insidehtml == 3) /* filtering comments; copy the skipped text to the output */ + copyout (&currentchar, skiptag(currentchar, "-->", 3) - currentchar); + else if (*currentchar == '>') + /* We've closed the tag so we reset the flag */ + insidehtml = 0; + } + /* End of modifications by Gerry Tierney */ + for ( ; ; ) { p = skiptoword (currentchar); +#ifdef HTSPECIAL + ctok_start = p; +#endif if (p != currentchar) copyout (&currentchar, p - currentchar); @@ -453,6 +612,23 @@ p = ctoken; endp = skipoverword (currentchar); +#ifdef HTSPECIAL + if (htmlflag == 1) /* We are honoring the ISO-HTML entities, */ + { /* and have to convert to ISO before lookup */ + while (currentchar < endp && p < ctoken + sizeof ctoken - 1) + { + if (*currentchar == '&') + { + *p++ = html_ent(&currentchar); + if (currentchar > endp) + currentchar = endp; + } + else + *p++ = *currentchar++; + } + } + else +#endif while (currentchar < endp && p < ctoken + sizeof ctoken - 1) *p++ = *currentchar++; *p = 0; @@ -545,6 +721,11 @@ } } if (!aflag && !lflag) +#ifdef HTSPECIAL + if (htmlflag == 1) /* Translate into output file */ + (void) fprintf (ofile, "%s", htmlword(ctoken)); + else +#endif (void) fprintf (ofile, "%s", ctoken); } @@ -899,3 +1080,178 @@ } return cmpresult; } + + +#ifdef HTSPECIAL + +/* + * Code to convert from / to ISO HTML-entities. + * + * Decoding of alphabetic entities is performed by two table lookups, + * one for each of the first two characters after the `&'. + * + * The first lookup decides which string to use for the second lookup, + * and if both match, the corresponding position in the isochar array + * holds the character value. 
+ * + * After the 8-bit value is determined, the input is verified against + * the iso_ent array, using strncmp(), and in case of mismatch the + * function returns the input character unconverted. + * + * The alternate numeric form of &#nnn; is also decoded by atoi, and + * checked for sanity, but will be converted to the name-form on output. + * + * Encoding is performed by htmlword on characters between FIRST_ISO and 255, + * and the iso_ent table must hold an entry for each. + */ +static char *Y_key = "ACEINOTUYsaceinotuy"; /* Primary key */ + +static char *X_key[] = { /* Secondary key: */ + "gacturE", + "c", + "gacuT", + "gacu", + "t", + "gactus", + "h", + "gacu", + "a", + "z", + "gacture", + "c", + "gacut", + "gacu", + "t", + "gactus", + "h", + "gacu", + "au" +}; + +static unsigned char *isochar[] = { /* 8-bit values of above table */ + "\300\301\302\303\304\305\306", + "\307", + "\310\311\312\313\320", + "\314\315\316\317", + "\321", + "\322\323\324\325\326\330", + "\336", + "\331\332\333\334", + "\335", + "\337", + "\340\341\342\343\344\345\346", + "\347", + "\350\351\352\353\360", + "\354\355\356\357", + "\361", + "\362\363\364\365\366\370", + "\376", + "\371\372\373\374", + "\375\377", +}; + +/* + * Reference: http://uts.cc.utexas.edu/~churchh/latin1.html + */ +char *iso_ent[] = { /* Valid HTML characters above 160 in numerical order */ + "&nbsp;", "&iexcl;", "&cent;", "&pound;", "&curren;", "&yen;", + "&brvbar;", "&sect;", "&uml;", "&copy;", "&ordf;", "&laquo;", + "&not;", "&shy;", "&reg;", "&macr;", "&deg;", "&plusmn;", + "&sup2;", "&sup3;", "&acute;", "&micro;", "&para;", "&middot;", + "&cedil;", "&sup1;", "&ordm;", "&raquo;", "&frac14;", "&frac12;", + "&frac34;", "&iquest;", + "&Agrave;", "&Aacute;", "&Acirc;", "&Atilde;", "&Auml;", "&Aring;", "&AElig;", + "&Ccedil;", + "&Egrave;", "&Eacute;", "&Ecirc;", "&Euml;", + "&Igrave;", "&Iacute;", "&Icirc;", "&Iuml;", + "&ETH;", "&Ntilde;", + "&Ograve;", "&Oacute;", "&Ocirc;", "&Otilde;", "&Ouml;", "&times;", "&Oslash;", + "&Ugrave;", "&Uacute;", "&Ucirc;", "&Uuml;", + "&Yacute;", "&THORN;", "&szlig;", + "&agrave;", "&aacute;", "&acirc;", "&atilde;", "&auml;", "&aring;", "&aelig;", + "&ccedil;", + "&egrave;", "&eacute;", "&ecirc;", "&euml;", + "&igrave;", "&iacute;", "&icirc;", "&iuml;", + "&eth;", "&ntilde;", + "&ograve;", "&oacute;", "&ocirc;", "&otilde;", "&ouml;", "&divide;", "&oslash;", + "&ugrave;", "&uacute;", "&ucirc;", "&uuml;", + "&yacute;", "&thorn;", "&yuml;" +}; + +/* Increment pointer past ignored entity, returning nonzero on success + */ +int skip_ent(char 
**entity) +{ + char **cpp; + int j, match = 0; + + if (strncmp(*entity, "&lt;" , 4) == 0 || strncmp(*entity, "&gt;" , 4) == 0) + match = 3; + else if (strncmp(*entity, "&amp;" , 5) == 0) + match = 4; + else if (strncmp(*entity, "&quot;" , 6) == 0) + match = 5; + else if (strncmp(*entity, "&nbsp;" , 6) == 0) + match = 5; + if (match) + *entity += match; + return(match); +} + +/* Return 8-bit value of valid html-entity pointed to by *in, incrementing + * the pointer by the length of the tag. + * Only the first two alpha characters after '&' is tested, then the + * decoded char is verified against the iso_ent array. + */ +int html_ent(char **in) + { + char *cp, *cp2, ch; + char *decoded; + int row, val, taglen = 1; + + cp = cp2 = *in; + val = ch = **in; + + if (*++cp && *cp == '#' && (*++cp == '1' || *cp == '2') && (val = atoi(cp))) + { + if (val > 255 || *++cp < '0' || *cp > '5' || *++cp < '0' || *cp > '9' || *++cp!=';') + val = 0; + else + taglen = 6; /* Validate numeric tag */ + } + else + { + if ((cp = index(Y_key, *++cp2)) && + (decoded = index(*(X_key+(row = cp-Y_key)), *++cp2)) && + (val = isochar[row] [ decoded - X_key[row]]) >= FIRST_ISO) + taglen = strlen(iso_ent[val - FIRST_ISO]); + if (val