,

/* ** Copyright 1998 - 2002 Double Precision, Inc. See COPYING for ** distribution information. */ /* ** $Id: html.c,v 1.17 2002/09/25 12:21:55 mrsam Exp $ */ #include #include #include #include #include "config.h" #include "cgi/cgi.h" #include "sqwebmail.h" #include "rfc2045/rfc2045.h" void decodehtmlchar(char *p) { char *q; for (q=p; *p; ) { int i; if (*p != '&') { *q++=*p++; continue; } if ( p[1] == '#') { unsigned c=0; for (p += 2; isdigit((int)(unsigned char)*p); p++) c=c * 10 + (*p++ - '0'); c=(unsigned char)c; if (c) *q++=c; if (*p == ';') p++; } for (i=1; p[i]; i++) if (!isalpha((int)(unsigned char)p[i])) break; if (p[i] != ';') { *q++=*p++; continue; } for (i=0; p[i] != ';'; i++) p[i]=tolower(p[i]); ++i; if (strncmp(p, "<", 4) == 0) { *q++ = '<'; } else if ( strncmp(p, ">",4) == 0) { *q++ = '>'; } else if ( strncmp(p, "&", 5) == 0) { *q++ = '&'; } else if ( strncmp(p, """, 6) == 0) { *q++ = '"'; } p += i; } *q=0; } /* HTML sanitization filter. Transforms HTML as follows: The following tags are dropped: , , , , , , ,

, , , , , , , , , , , , . The ONLOAD, ONMOUSEOVER, and all other ON* attributes are removed. Attributes TARGET, CODE, ACTION, CODETYPE and LANGUAGE are removed. TARGET=_blank is added to all tags. HREF, SRC, or LOWSRC attributes that do not specify a URL of http:, https:, ftp:, gopher:, wais:, or telnet:, are removed. */ char *tagbuf=0; size_t tagbufsize=0, tagbuflen; struct tagattrinfo { char *tagname; size_t tagnamelen; char *tagvalue; size_t tagvaluelen; char *atagstart; /* Entire tag=value location */ size_t ataglen; } ; struct tagattrinfo *tagattr=0; size_t tagattrsize=0, tagattrlen; static void addtagbuf(int c) { if (tagbufsize >= 1024) return; /* DOS attack - get rid of the tag */ if (tagbuflen >= tagbufsize) { char *newtagbuf= tagbuf ? (char *)realloc(tagbuf, tagbufsize+256) :(char *)malloc(tagbufsize+256); if (!newtagbuf) enomem(); tagbuf=newtagbuf; tagbufsize += 256; } tagbuf[tagbuflen++]=c; } /* Parse the contents of tagbuf into individual attributes. If argument is ** NULL, just the count of attributes is returned. That's the first pass. ** On the second pass the argument points to a struct tagattrinfo array which ** we initialize. ** ** The first attribute is -- obviously -- the actual tag. */ static size_t parseattr(struct tagattrinfo *tai) { size_t c=0; char *p; for (p=tagbuf; *p; ) { while (*p && isspace((int)(unsigned char)*p)) p++; if (!*p) break; ++c; if (tai) { tai->tagname=p; tai->tagnamelen=0; tai->atagstart=p; } while (*p && !isspace((int)(unsigned char)*p) && *p != '=') { ++p; if (tai) ++tai->tagnamelen; } if (*p != '=') /* No attribute value */ { if (tai) { tai->tagvalue=0; tai->tagvaluelen=0; } } else { char c; ++p; if ((c=*p) == '"' || c == '\'') /* Attr value in quotes */ { ++p; if (tai) { tai->tagvalue=p; tai->tagvaluelen=0; } while (*p && *p != (char)c) { ++p; if (tai) ++tai->tagvaluelen; } if (*p) p++; } else { if (tai) { tai->tagvalue=p; tai->tagvaluelen=0; } while (*p && !isspace((int)(unsigned char)*p)) { p++; if (tai) { tai->tagvalue=p; tai->tagvaluelen=0; } } } } if (tai) { tai->ataglen=p-tai->atagstart; ++tai; } } return (c); } static void parsetagbuf() { char *p; while ((p=strchr(tagbuf, '<')) != NULL) *p=' '; tagattrlen=parseattr(0); if ( tagattrlen > tagattrsize) { struct tagattrinfo *newta= tagattr ? (struct tagattrinfo *) realloc(tagattr, (tagattrlen+16)*sizeof(*tagattr)) :(struct tagattrinfo *) malloc((tagattrlen+16)*sizeof(*tagattr)); if (!newta) enomem(); tagattrsize=tagattrlen+16; tagattr=newta; } parseattr(tagattr); } /* See if this attribute is the one we're looking for */ static int is_attr(struct tagattrinfo *i, const char *l) { size_t ll=strlen(l); return (i->tagnamelen == ll && strncasecmp(i->tagname, l, ll) == 0); } /* If this is the tag we're looking for */ static int is_htmltag(const char *l) { return (tagattrlen ? is_attr(tagattr, l):0); } /* See if the attribute value starts with what we're looking for */ static int is_valuestart(const char *v, const char *l) { while (v && isspace((int)(unsigned char)*v)) ++v; return (v && strncasecmp(v, l, strlen(l)) == 0); } /* htmlfilter() is repeatedly called to filter the HTML text. htmlfilter() will call htmlfiltered() with the filtered text, more or less on a one to one basis. htmlfilter_init() must be called before the first invocation of htmlfilter(). Because the HTML can be fed in arbitrary quantities, htmlfilter() implements a state machine which htmlfilter_init() initializes. */ enum htmlstate { intext, /* Initial value. In plain text */ seenlt, /* Seen < */ seenltbang, /* Seen ') { (*htmlfiltered_func)(p+start, l-start); (*htmlfiltered_func)(" ", 6); start=l+1; } if (p[l] != '<') continue; if (!instyletag) (*htmlfiltered_func)(p+start, l-start); /* Output everything up until the tag */ cur_state=seenlt; tagbuflen=0; break; case seenlt: if (p[l] == '>') { cur_state=intext; start=l+1; if (!instyletag) (*htmlfiltered_func)("<>", 2); /* Eh? */ continue; } if (isspace((int)(unsigned char)p[l])) break; if (p[l] == '!') cur_state=seenltbang; else if (p[l] != '/' && !isalpha((int)(unsigned char)p[l])) { start=l+1; cur_state=intext; break; } else cur_state=intag; addtagbuf(p[l]); break; case intag: /* We're in a tag (not a