Nick Kew wrote:
On Wed, 08 Nov 2006 12:56:28 -0500 mickg <mickg@xxxxxxxxx> wrote:Do you want the full working code once I clean up the memory problem? It is, after all, GPL, so it would be in good spirit for me to release the modified source. :)Yes please. I haven't thought through whether to incorporate this or something similar. If I do, I'll want to base it on apr_iconv, rather than native iconv. But having your code there to look at can't hurt, regardless of what I end up doing.
Attached. Code compiles on Ubuntu, assuming apache-dev, libxml2-dev, and a ln -s /usr/include/libxml2/libxml /usr/include/libxml apxs2 -i -c mod_proxy_html.c No warnings on the new functions are emitted. I am now using it on a webserver, and will say tomorrow whether there are any major memory leaks (A decent amount of traffic is going through it). Essential Missing: Rewriting of get & post request data. The reason for using iconv, and not apache's iconv: libxml already opens the iconv handle during initialization. Might as well use it. Standard discalimers apply. Code is GPL, my modifications are, for WebThing's use, BSDed. TODO list: Add rewriting of POST/GET requests. Add directive to set default encoding if non availible (once I figure out how to add directives). Add directive to set output encoding (and convert to it) (once I figure out how to modify data post-processing) Maybe make a mod_charset_libxml charset converter. As the mod_charset_light is not working, and I am not sure I want to fix that. ( For the record, *why oh why* are we doing text munging in C/C++ ? As someone who coded in C a long, long time ago , I find I am much more productive in various HLLs, such as Python. This, of course, excepts kernel code. I have half a mind to make a Python, Perl, or Lisp-based filter. ) mickg
/******************************************************************** Copyright (c) 2003-5, WebThing Ltd Author: Nick Kew <nick@xxxxxxxxxxxx> Modifier: Michael Gorbovitski <mickg@xxxxxxxxx> 2006 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *********************************************************************/ /******************************************************************** Note to Users You are requested to register as a user, at http://apache.webthing.com/registration.html This entitles you to support from the developer. I'm unlikely to reply to help/support requests from non-registered users, unless you're paying and/or offering constructive feedback such as bug reports or sensible suggestions for further development. It also makes a small contribution to the effort that's gone into developing this work. *********************************************************************/ /* End of Notices */ /* GO_FASTER You can #define GO_FASTER to disable informational logging. This disables the ProxyHTMLLogVerbose option altogether. Default is to leave it undefined, and enable verbose logging as a configuration option. Binaries are supplied with verbose logging enabled. */ #ifdef GO_FASTER #define VERBOSE(x) #else #define VERBOSE(x) if ( verbose ) x #endif #define VERSION_STRING "proxy_html/2.5" #include <ctype.h> /* libxml */ #include <libxml/HTMLparser.h> /* apache */ #include <http_protocol.h> #include <http_config.h> #include <http_log.h> #include <apr_strings.h> /* To support Apache 2.1/2.2, we need the ap_ forms of the * regexp stuff, and they're now used in the code. * To support 2.0 in the same compile, * we #define the * AP_ versions if necessary. */ #ifndef AP_REG_ICASE /* it's 2.0, so we #define the ap_ versions */ #define ap_regex_t regex_t #define ap_regmatch_t regmatch_t #define AP_REG_EXTENDED REG_EXTENDED #define AP_REG_ICASE REG_ICASE #define AP_REG_NOSUB REG_NOSUB #define AP_REG_NEWLINE REG_NEWLINE #endif module AP_MODULE_DECLARE_DATA proxy_html_module ; #define M_HTML 0x01 #define M_EVENTS 0x02 #define M_CDATA 0x04 #define M_REGEX 0x08 #define M_ATSTART 0x10 #define M_ATEND 0x20 #define M_LAST 0x40 typedef struct { unsigned int start ; unsigned int end ; } meta ; typedef struct urlmap { struct urlmap* next ; unsigned int flags ; union { const char* c ; ap_regex_t* r ; } from ; const char* to ; } urlmap ; typedef struct { urlmap* map ; const char* doctype ; const char* etag ; unsigned int flags ; int extfix ; int metafix ; int strip_comments ; #ifndef GO_FASTER int verbose ; #endif size_t bufsz ; } proxy_html_conf ; typedef struct { htmlSAXHandlerPtr sax ; ap_filter_t* f ; proxy_html_conf* cfg ; htmlParserCtxtPtr parser ; apr_bucket_brigade* bb ; char* buf ; size_t offset ; size_t avail ; const char * enc_from; } saxctxt ; static int is_empty_elt(const char* name) { const char** p ; static const char* empty_elts[] = { "br" , "link" , "img" , "hr" , "input" , "meta" , "base" , "area" , "param" , "col" , "frame" , "isindex" , "basefont" , NULL } ; for ( p = empty_elts ; *p ; ++p ) if ( !strcmp( *p, name) ) return 1 ; return 0 ; } typedef struct { const char* name ; const char** attrs ; } elt_t ; #define NORM_LC 0x1 #define NORM_MSSLASH 0x2 #define NORM_RESET 0x4 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t ; static void normalise(unsigned int flags, char* str) { xmlChar* p ; if ( flags & NORM_LC ) for ( p = str ; *p ; ++p ) if ( isupper(*p) ) *p = tolower(*p) ; if ( flags & NORM_MSSLASH ) for ( p = strchr(str, '\\') ; p ; p = strchr(p+1, '\\') ) *p = '/' ; } #define FLUSH ap_fwrite(ctx->f->next, ctx->bb, (chars+begin), (i-begin)) ; begin = i+1 static void pcharacters(void* ctxt, const xmlChar *chars, int length) { saxctxt* ctx = (saxctxt*) ctxt ; int i ; int begin ; for ( begin=i=0; i<length; i++ ) { switch (chars[i]) { case '&' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "&") ; break ; case '<' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, "<") ; break ; case '>' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, ">") ; break ; case '"' : FLUSH ; ap_fputs(ctx->f->next, ctx->bb, """) ; break ; default : break ; } } FLUSH ; } static void preserve(saxctxt* ctx, const size_t len) { char* newbuf ; if ( len <= ( ctx->avail - ctx->offset ) ) return ; else while ( len > ( ctx->avail - ctx->offset ) ) ctx->avail += ctx->cfg->bufsz ; newbuf = realloc(ctx->buf, ctx->avail) ; if ( newbuf != ctx->buf ) { if ( ctx->buf ) apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, (void*)free) ; apr_pool_cleanup_register(ctx->f->r->pool, newbuf, (void*)free, apr_pool_cleanup_null); ctx->buf = newbuf ; } } size_t ConvertInput (const char *in, const char **newbuf, size_t size, request_rec *r, const char *encoding) { char *out; char *oldout; size_t ret; size_t out_size; size_t temp; xmlCharEncodingHandlerPtr handler; if (in == 0) return 0; handler = xmlFindCharEncodingHandler (encoding); if (!handler) { ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, r, "ConvertInput: no encoding handler found for %s", encoding); return 0; } out_size = (size + 1) * 2 -1 ; out = apr_palloc (r->pool, (size_t) out_size); oldout = out; if (out != 0) { temp = size; if (handler->input) { ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, r, "Converting using libxml2!"); ret = handler->input ((unsigned char *)out, (int*)&out_size, (unsigned char *)in, (int*)&temp); } else { ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, r, "Converting using iconv!"); ret = iconv (handler->iconv_in, (char**)&in, &temp, &out, &out_size); } if ((ret < 0)) { ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, r, "ConvertInput: conversion wasn't succesful!, %d out of %d left to convert!", temp, size); out = 0; out_size = -1; } else { out=oldout; out_size = ((size + 1) * 2 -1) - out_size; out[out_size] = 0; /*null terminating out */ } } else { ap_log_rerror (APLOG_MARK, APLOG_ERR, 0, r, "No memory!"); } *newbuf = out; return out_size; } static void pappend(saxctxt* ctx, const char* buf, const size_t len) { preserve(ctx, len) ; memcpy(ctx->buf+ctx->offset, buf, len) ; ctx->offset += len ; } static void dump_content(saxctxt* ctx) { urlmap* m ; char* found ; size_t s_from, s_to ; size_t match ; char c = 0 ; int nmatch ; ap_regmatch_t pmatch[10] ; char* subs ; size_t len, offs ; #ifndef GO_FASTER int verbose = ctx->cfg->verbose ; #endif pappend(ctx, &c, 1) ; /* append null byte */ /* parse the text for URLs */ for ( m = ctx->cfg->map ; m ; m = m->next ) { if ( ! ( m->flags & M_CDATA ) ) continue ; if ( m->flags & M_REGEX ) { nmatch = 10 ; offs = 0 ; while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) { match = pmatch[0].rm_so ; s_from = pmatch[0].rm_eo - match ; subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs, nmatch, pmatch) ; s_to = strlen(subs) ; len = strlen(ctx->buf) ; offs += match ; VERBOSE( { const char* f = apr_pstrndup(ctx->f->r->pool, ctx->buf + offs , s_from ) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r, "C/RX: match at %s, substituting %s", f, subs) ; } ) if ( s_to > s_from) { preserve(ctx, s_to - s_from) ; memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, len + 1 - s_from - offs) ; memcpy(ctx->buf+offs, subs, s_to) ; } else { memcpy(ctx->buf + offs, subs, s_to) ; memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, len + 1 - s_from - offs) ; } offs += s_to ; } } else { s_from = strlen(m->from.c) ; s_to = strlen(m->to) ; for ( found = strstr(ctx->buf, m->from.c) ; found ; found = strstr(ctx->buf+match+s_to, m->from.c) ) { match = found - ctx->buf ; if ( ( m->flags & M_ATSTART ) && ( match != 0) ) break ; len = strlen(ctx->buf) ; if ( ( m->flags & M_ATEND ) && ( match < (len - s_from) ) ) continue ; VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r, "C: matched %s, substituting %s", m->from.c, m->to) ) ; if ( s_to > s_from ) { preserve(ctx, s_to - s_from) ; memmove(ctx->buf+match+s_to, ctx->buf+match+s_from, len + 1 - s_from - match) ; memcpy(ctx->buf+match, m->to, s_to) ; } else { memcpy(ctx->buf+match, m->to, s_to) ; memmove(ctx->buf+match+s_to, ctx->buf+match+s_from, len + 1 - s_from - match) ; } } } } ap_fputs(ctx->f->next, ctx->bb, ctx->buf) ; } static void pcdata(void* ctxt, const xmlChar *chars, int length) { saxctxt* ctx = (saxctxt*) ctxt ; if ( ctx->cfg->extfix ) { pappend(ctx, chars, length) ; } else { ap_fwrite(ctx->f->next, ctx->bb, chars, length) ; } } static void pcomment(void* ctxt, const xmlChar *chars) { saxctxt* ctx = (saxctxt*) ctxt ; if ( ctx->cfg->strip_comments ) return ; if ( ctx->cfg->extfix ) { pappend(ctx, "<!--", 4) ; pappend(ctx, chars, strlen(chars) ) ; pappend(ctx, "-->", 3) ; } else { ap_fputstrs(ctx->f->next, ctx->bb, "<!--", chars, "-->", NULL) ; } } static void pendElement(void* ctxt, const xmlChar* name) { saxctxt* ctx = (saxctxt*) ctxt ; if ( ctx->offset > 0 ) { dump_content(ctx) ; ctx->offset = 0 ; /* having dumped it, we can re-use the memory */ } if ( ! is_empty_elt(name) ) ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name) ; } static void pstartElement(void* ctxt, const xmlChar* name, const xmlChar** attrs ) { int num_match ; size_t offs, len ; char* subs ; rewrite_t is_uri ; const char** linkattrs ; const xmlChar** a ; const elt_t* elt ; const char** linkattr ; urlmap* m ; size_t s_to, s_from, match ; char* found ; saxctxt* ctx = (saxctxt*) ctxt ; size_t nmatch ; ap_regmatch_t pmatch[10] ; #ifndef GO_FASTER int verbose = ctx->cfg->verbose ; #endif static const char* href[] = { "href", NULL } ; static const char* cite[] = { "cite", NULL } ; static const char* action[] = { "action", NULL } ; static const char* imgattr[] = { "src", "longdesc", "usemap", NULL } ; static const char* inputattr[] = { "src", "usemap", NULL } ; static const char* scriptattr[] = { "src", "for", NULL } ; static const char* frameattr[] = { "src", "longdesc", NULL } ; static const char* objattr[] = { "classid", "codebase", "data", "usemap", NULL } ; static const char* profile[] = { "profile", NULL } ; static const char* background[] = { "background", NULL } ; static const char* codebase[] = { "codebase", NULL } ; static const elt_t linked_elts[] = { { "a" , href } , { "img" , imgattr } , { "form", action } , { "link" , href } , { "script" , scriptattr } , { "base" , href } , { "area" , href } , { "input" , inputattr } , { "frame", frameattr } , { "iframe", frameattr } , { "object", objattr } , { "q" , cite } , { "blockquote" , cite } , { "ins" , cite } , { "del" , cite } , { "head" , profile } , { "body" , background } , { "applet", codebase } , { NULL, NULL } } ; static const char* events[] = { "onclick" , "ondblclick" , "onmousedown" , "onmouseup" , "onmouseover" , "onmousemove" , "onmouseout" , "onkeypress" , "onkeydown" , "onkeyup" , "onfocus" , "onblur" , "onload" , "onunload" , "onsubmit" , "onreset" , "onselect" , "onchange" , NULL } ; ap_fputc(ctx->f->next, ctx->bb, '<') ; ap_fputs(ctx->f->next, ctx->bb, name) ; if ( attrs ) { linkattrs = 0 ; for ( elt = linked_elts; elt->name != NULL ; ++elt ) if ( !strcmp(elt->name, name) ) { linkattrs = elt->attrs ; break ; } for ( a = attrs ; *a ; a += 2 ) { ctx->offset = 0 ; if ( a[1] ) { pappend(ctx, a[1], strlen(a[1])+1) ; is_uri = ATTR_IGNORE ; if ( linkattrs ) { for ( linkattr = linkattrs ; *linkattr ; ++linkattr) { if ( !strcmp(*linkattr, *a) ) { is_uri = ATTR_URI ; break ; } } } if ( (is_uri == ATTR_IGNORE) && ctx->cfg->extfix ) { for ( linkattr = events; *linkattr; ++linkattr ) { if ( !strcmp(*linkattr, *a) ) { is_uri = ATTR_EVENT ; break ; } } } switch ( is_uri ) { case ATTR_URI: num_match = 0 ; for ( m = ctx->cfg->map ; m ; m = m->next ) { if ( ! ( m->flags & M_HTML ) ) continue ; if ( m->flags & M_REGEX ) { nmatch = 10 ; if ( ! ap_regexec(m->from.r, ctx->buf, nmatch, pmatch, 0) ) { ++num_match ; offs = match = pmatch[0].rm_so ; s_from = pmatch[0].rm_eo - match ; subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs, nmatch, pmatch) ; VERBOSE( { const char* f = apr_pstrndup(ctx->f->r->pool, ctx->buf + offs , s_from ) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r, "H/RX: match at %s, substituting %s", f, subs) ; } ) s_to = strlen(subs) ; len = strlen(ctx->buf) ; if ( s_to > s_from) { preserve(ctx, s_to - s_from) ; memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, len + 1 - s_from - offs) ; memcpy(ctx->buf+offs, subs, s_to) ; } else { memcpy(ctx->buf + offs, subs, s_to) ; memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, len + 1 - s_from - offs) ; } } } else { s_from = strlen(m->from.c) ; if ( ! strncasecmp(ctx->buf, m->from.c, s_from ) ) { ++num_match ; s_to = strlen(m->to) ; len = strlen(ctx->buf) ; VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r, "H: matched %s, substituting %s", m->from.c, m->to) ) ; if ( s_to > s_from ) { preserve(ctx, s_to - s_from) ; memmove(ctx->buf+s_to, ctx->buf+s_from, len + 1 - s_from ) ; memcpy(ctx->buf, m->to, s_to) ; } else { /* it fits in the existing space */ memcpy(ctx->buf, m->to, s_to) ; memmove(ctx->buf+s_to, ctx->buf+s_from, len + 1 - s_from) ; } break ; } } if ( num_match > 0 ) /* URIs only want one match */ break ; } break ; case ATTR_EVENT: for ( m = ctx->cfg->map ; m ; m = m->next ) { num_match = 0 ; /* reset here since we're working per-rule */ if ( ! ( m->flags & M_EVENTS ) ) continue ; if ( m->flags & M_REGEX ) { nmatch = 10 ; offs = 0 ; while ( ! ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0) ) { match = pmatch[0].rm_so ; s_from = pmatch[0].rm_eo - match ; subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs, nmatch, pmatch) ; VERBOSE( { const char* f = apr_pstrndup(ctx->f->r->pool, ctx->buf + offs , s_from ) ; ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r, "E/RX: match at %s, substituting %s", f, subs) ; } ) s_to = strlen(subs) ; offs += match ; len = strlen(ctx->buf) ; if ( s_to > s_from) { preserve(ctx, s_to - s_from) ; memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, len + 1 - s_from - offs) ; memcpy(ctx->buf+offs, subs, s_to) ; } else { memcpy(ctx->buf + offs, subs, s_to) ; memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, len + 1 - s_from - offs) ; } offs += s_to ; ++num_match ; } } else { found = strstr(ctx->buf, m->from.c) ; if ( (m->flags & M_ATSTART) && ( found != ctx->buf) ) continue ; while ( found ) { s_from = strlen(m->from.c) ; s_to = strlen(m->to) ; match = found - ctx->buf ; if ( ( s_from < strlen(found) ) && (m->flags & M_ATEND ) ) { found = strstr(ctx->buf+match+s_from, m->from.c) ; continue ; } else { found = strstr(ctx->buf+match+s_to, m->from.c) ; } VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, ctx->f->r, "E: matched %s, substituting %s", m->from.c, m->to) ) ; len = strlen(ctx->buf) ; if ( s_to > s_from ) { preserve(ctx, s_to - s_from) ; memmove(ctx->buf+match+s_to, ctx->buf+match+s_from, len + 1 - s_from - match) ; memcpy(ctx->buf+match, m->to, s_to) ; } else { memcpy(ctx->buf+match, m->to, s_to) ; memmove(ctx->buf+match+s_to, ctx->buf+match+s_from, len + 1 - s_from - match) ; } ++num_match ; } } if ( num_match && ( m->flags & M_LAST ) ) break ; } break ; case ATTR_IGNORE: break ; } } if ( ! a[1] ) ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL) ; else { if ( ctx->cfg->flags != 0 ) normalise(ctx->cfg->flags, ctx->buf) ; /* write the attribute, using pcharacters to html-escape anything that needs it in the value. */ ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL) ; pcharacters(ctx, ctx->buf, strlen(ctx->buf)) ; ap_fputc(ctx->f->next, ctx->bb, '"') ; } } } ctx->offset = 0 ; if ( is_empty_elt(name) ) ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag) ; else ap_fputc(ctx->f->next, ctx->bb, '>') ; } static htmlSAXHandlerPtr setupSAX(apr_pool_t* pool) { htmlSAXHandlerPtr sax = apr_pcalloc(pool, sizeof(htmlSAXHandler) ) ; sax->startDocument = NULL ; sax->endDocument = NULL ; sax->startElement = pstartElement ; sax->endElement = pendElement ; sax->characters = pcharacters ; sax->comment = pcomment ; sax->cdataBlock = pcdata ; return sax ; } static ap_regex_t* seek_meta_ctype ; static ap_regex_t* seek_charset ; static ap_regex_t* seek_meta ; static void proxy_html_child_init(apr_pool_t* pool, server_rec* s) { seek_meta_ctype = ap_pregcomp(pool, "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)", AP_REG_EXTENDED|AP_REG_ICASE) ; seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)", AP_REG_EXTENDED|AP_REG_ICASE) ; seek_meta = ap_pregcomp(pool, "<meta[^>]*(http-equiv)[^>]*>", AP_REG_EXTENDED|AP_REG_ICASE) ; } static xmlCharEncoding sniff_encoding( request_rec* r, const char* cbuf, size_t bytes, saxctxt *ctxt #ifndef GO_FASTER , int verbose #endif ) { xmlCharEncoding ret ; char* encoding = NULL ; char* p ; ap_regmatch_t match[2] ; unsigned char* buf = (unsigned char*)cbuf ; VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, "Content-Type is %s", r->content_type) ) ; /* If we've got it in the HTTP headers, there's nothing to do */ if ( r->content_type && ( p = ap_strcasestr(r->content_type, "charset=") , p > 0 ) ) { p += 8 ; if ( encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ) , encoding ) { if ( ret = xmlParseCharEncoding(encoding), ret != XML_CHAR_ENCODING_ERROR ) { VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, "Got charset %s from HTTP headers", encoding) ) ; return ret ; } else { ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, "Unsupported charset %s in HTTP headers, falling back to iconv", encoding) ; ctxt->enc_from=encoding; return 0 ; } } } /* to sniff, first we look for BOM */ if ( ret = xmlDetectCharEncoding(buf, bytes), ret != XML_CHAR_ENCODING_NONE ) { VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, "Got charset from XML rules.") ) ; return ret ; } /* If none of the above, look for a META-thingey */ encoding = NULL ; if ( ap_regexec(seek_meta_ctype, buf, 1, match, 0) == 0 ) { p = apr_pstrndup(r->pool, buf + match[0].rm_so, match[0].rm_eo - match[0].rm_so) ; if ( ap_regexec(seek_charset, p, 2, match, 0) == 0 ) encoding = apr_pstrndup(r->pool, p+match[1].rm_so, match[1].rm_eo - match[1].rm_so) ; } /* either it's set to something we found or it's still the default */ if ( encoding ) { if ( ret = xmlParseCharEncoding(encoding), ret != XML_CHAR_ENCODING_ERROR ) { VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, "Got charset %s from HTML META", encoding) ) ; return ret ; } else { ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, "Unsupported charset %s in HTML META, falling back to iconv", encoding) ; ctxt->enc_from=encoding; return 0; } } /* the old HTTP default is a last resort */ ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, "No usable charset information: falling back to windows-1251") ; ctxt->enc_from="windows-1251"; return 0 ; } static meta* metafix(request_rec* r, const char* buf /*, size_t bytes*/ #ifndef GO_FASTER , int verbose #endif ) { meta* ret = NULL ; size_t offs = 0 ; const char* p ; const char* q ; char* header ; char* content ; ap_regmatch_t pmatch[2] ; char delim ; while ( ! ap_regexec(seek_meta, buf+offs, 2, pmatch, 0) ) { header = NULL ; content = NULL ; p = buf+offs+pmatch[1].rm_eo ; while ( !isalpha(*++p) ) ; for ( q = p ; isalnum(*q) || (*q == '-') ; ++q ) ; header = apr_pstrndup(r->pool, p, q-p) ; if ( strncasecmp(header, "Content-", 8) ) { /* find content=... string */ for ( p = ap_strstr((char*)buf+offs+pmatch[0].rm_so, "content") ; *p ; ) { p += 7 ; while ( *p && isspace(*p) ) ++p ; if ( *p != '=' ) continue ; while ( *p && isspace(*++p) ) ; if ( ( *p == '\'' ) || ( *p == '"' ) ) { delim = *p++ ; for ( q = p ; *q != delim ; ++q ) ; } else { for ( q = p ; *q && !isspace(*q) && (*q != '>') ; ++q ) ; } content = apr_pstrndup(r->pool, p, q-p) ; break ; } } else if ( !strncasecmp(header, "Content-Type", 12) ) { ret = apr_palloc(r->pool, sizeof(meta) ) ; ret->start = pmatch[0].rm_so ; ret->end = pmatch[0].rm_eo ; } if ( header && content ) { VERBOSE( ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, "Adding header [%s: %s] from HTML META", header, content) ) ; apr_table_setn(r->headers_out, header, content) ; } offs += pmatch[0].rm_eo ; } return ret ; } static int proxy_html_filter_init(ap_filter_t* f) { const char* env ; saxctxt* fctx ; #if 0 /* remove content-length filter */ ap_filter_rec_t* clf = ap_get_output_filter_handle("CONTENT_LENGTH") ; ap_filter_t* ff = f->next ; do { ap_filter_t* fnext = ff->next ; if ( ff->frec == clf ) ap_remove_output_filter(ff) ; ff = fnext ; } while ( ff ) ; #endif fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)) ; fctx->sax = setupSAX(f->r->pool) ; fctx->f = f ; fctx->bb = apr_brigade_create(f->r->pool, f->r->connection->bucket_alloc) ; fctx->cfg = ap_get_module_config(f->r->per_dir_config,&proxy_html_module); if ( f->r->proto_num >= 1001 ) { if ( ! f->r->main && ! f->r->prev ) { env = apr_table_get(f->r->subprocess_env, "force-response-1.0") ; if ( !env ) f->r->chunked = 1 ; } } apr_table_unset(f->r->headers_out, "Content-Length") ; apr_table_unset(f->r->headers_out, "ETag") ; return OK ; } static saxctxt* check_filter_init (ap_filter_t* f) { const char* errmsg = NULL ; if ( ! f->r->proxyreq ) { errmsg = "Non-proxy request; not inserting proxy-html filter" ; } else if ( ! f->r->content_type ) { errmsg = "No content-type; bailing out of proxy-html filter" ; } else if ( strncasecmp(f->r->content_type, "text/html", 9) && strncasecmp(f->r->content_type, "application/xhtml+xml", 21) ) { errmsg = "Non-HTML content; not inserting proxy-html filter" ; } if ( errmsg ) { #ifndef GO_FASTER proxy_html_conf* cfg = ap_get_module_config(f->r->per_dir_config, &proxy_html_module); if ( cfg->verbose ) { ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, f->r, errmsg) ; } #endif ap_remove_output_filter(f) ; return NULL ; } if ( ! f->ctx ) proxy_html_filter_init(f) ; return f->ctx ; } size_t ConvertCtxtBuffer (const char *buf, const char **newbuf, size_t bytes, saxctxt * ctxt, ap_filter_t * f,const char* newencoding) { if (!newencoding) newencoding=ctxt->enc_from; size_t len = 0; if (newencoding) { if (!xmlFindCharEncodingHandler (newencoding)) { ap_log_rerror (APLOG_MARK, APLOG_ERR, 0, f->r, "ConvertInput: no encoding handler found for '%s'", newencoding); *newbuf = buf; return bytes; } else { ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, f->r, "ConvertInput: bytes: %d, ", bytes); len = ConvertInput (buf, newbuf, bytes, f->r, newencoding); ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, f->r, "ConvertInput: New Output Size (bytes): %d, ", len); if (len < 0) { ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, f->r, "ConvertInput: conversion failed from '%s'", newencoding); *newbuf = buf; return bytes; } buf = *newbuf; ap_log_rerror (APLOG_MARK, APLOG_INFO, 0, f->r, "ConvertInput: encoding handler found for '%s', conversion suceeded", newencoding); return len; } } else { *newbuf = buf; return bytes; } } static int proxy_html_filter (ap_filter_t * f, apr_bucket_brigade * bb) { apr_bucket *b; meta *m = NULL; xmlCharEncoding enc; const char *buf = 0; apr_size_t bytes = 0; #ifndef USE_OLD_LIBXML2 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET | XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING; #endif saxctxt *ctxt = check_filter_init (f); if (!ctxt) return ap_pass_brigade (f->next, bb); for (b = APR_BRIGADE_FIRST (bb); b != APR_BRIGADE_SENTINEL (bb); b = APR_BUCKET_NEXT (b)) { if (APR_BUCKET_IS_EOS (b)) { if (ctxt->parser != NULL) { htmlParseChunk (ctxt->parser, buf, 0, 1); } APR_BRIGADE_INSERT_TAIL (ctxt->bb, apr_bucket_eos_create (ctxt->bb-> bucket_alloc)); ap_pass_brigade (ctxt->f->next, ctxt->bb); } else if (!APR_BUCKET_IS_METADATA (b) && apr_bucket_read (b, &buf, &bytes, APR_BLOCK_READ) == APR_SUCCESS) { if (ctxt->parser == NULL) { if (buf && buf[bytes] != 0) { /* make a string for parse routines to play with */ char *buf1 = apr_palloc (f->r->pool, bytes + 1); memcpy (buf1, buf, bytes); buf1[bytes] = 0; buf = buf1; } #ifndef GO_FASTER enc = sniff_encoding (f->r, buf, bytes, ctxt, ctxt->cfg->verbose); if (enc == 0) { enc = XML_CHAR_ENCODING_UTF8; } if (ctxt->cfg->metafix) m = metafix (f->r, buf, ctxt->cfg->verbose); #else enc = sniff_encoding (f->r, buf, bytes, ctxt); if (enc == 0) { enc = XML_CHAR_ENCODING_UTF8; } if (ctxt->cfg->metafix) m = metafix (f->r, buf); #endif ap_set_content_type (f->r, "text/html;charset=utf-8"); ap_fputs (f->next, ctxt->bb, ctxt->cfg->doctype); bytes = ConvertCtxtBuffer (buf, &buf, bytes, ctxt, f, 0); if (m) { ctxt->parser = htmlCreatePushParserCtxt (ctxt->sax, ctxt, buf, m->start, 0, enc); htmlParseChunk (ctxt->parser, buf + m->end, bytes - m->end, 0); } else { ctxt->parser = htmlCreatePushParserCtxt (ctxt->sax, ctxt, buf, bytes, 0, enc); } apr_pool_cleanup_register (f->r->pool, ctxt->parser, (void *) htmlFreeParserCtxt, apr_pool_cleanup_null); #ifndef USE_OLD_LIBXML2 if (xmlopts = xmlCtxtUseOptions (ctxt->parser, xmlopts), xmlopts) ap_log_rerror (APLOG_MARK, APLOG_WARNING, 0, f->r, "Unsupported parser opts %x", xmlopts); #endif } else { bytes = ConvertCtxtBuffer (buf, &buf, bytes, ctxt, f,0); htmlParseChunk (ctxt->parser, buf, bytes, 0); } } else { ap_log_rerror (APLOG_MARK, APLOG_ERR, 0, f->r, "Error in bucket read"); } } /*ap_fflush(ctxt->f->next, ctxt->bb) ; // uncomment for debug */ apr_brigade_cleanup (bb); return APR_SUCCESS; } static const char* fpi_html = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n" ; static const char* fpi_html_legacy = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" ; static const char* fpi_xhtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ; static const char* fpi_xhtml_legacy = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" ; static const char* html_etag = ">" ; static const char* xhtml_etag = " />" ; /*#define DEFAULT_DOCTYPE fpi_html */ static const char* DEFAULT_DOCTYPE = "" ; #define DEFAULT_ETAG html_etag static void* proxy_html_config(apr_pool_t* pool, char* x) { proxy_html_conf* ret = apr_pcalloc(pool, sizeof(proxy_html_conf) ) ; ret->doctype = DEFAULT_DOCTYPE ; ret->etag = DEFAULT_ETAG ; ret->bufsz = 8192 ; return ret ; } static void* proxy_html_merge(apr_pool_t* pool, void* BASE, void* ADD) { proxy_html_conf* base = (proxy_html_conf*) BASE ; proxy_html_conf* add = (proxy_html_conf*) ADD ; proxy_html_conf* conf = apr_palloc(pool, sizeof(proxy_html_conf)) ; if ( add->map && base->map ) { urlmap* a ; conf->map = NULL ; for ( a = base->map ; a ; a = a->next ) { urlmap* save = conf->map ; conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ; conf->map->next = save ; } for ( a = add->map ; a ; a = a->next ) { urlmap* save = conf->map ; conf->map = apr_pmemdup(pool, a, sizeof(urlmap)) ; conf->map->next = save ; } } else conf->map = add->map ? add->map : base->map ; conf->doctype = ( add->doctype == DEFAULT_DOCTYPE ) ? base->doctype : add->doctype ; conf->etag = ( add->etag == DEFAULT_ETAG ) ? base->etag : add->etag ; conf->bufsz = add->bufsz ; if ( add->flags & NORM_RESET ) { conf->flags = add->flags ^ NORM_RESET ; conf->metafix = add->metafix ; conf->extfix = add->extfix ; conf->strip_comments = add->strip_comments ; #ifndef GO_FASTER conf->verbose = add->verbose ; #endif } else { conf->flags = base->flags | add->flags ; conf->metafix = base->metafix | add->metafix ; conf->extfix = base->extfix | add->extfix ; conf->strip_comments = base->strip_comments | add->strip_comments ; #ifndef GO_FASTER conf->verbose = base->verbose | add->verbose ; #endif } return conf ; } #define REGFLAG(n,s,c) ( (s&&(ap_strchr((char*)(s),(c))!=NULL)) ? (n) : 0 ) #define XREGFLAG(n,s,c) ( (!s||(ap_strchr((char*)(s),(c))==NULL)) ? (n) : 0 ) static const char* set_urlmap(cmd_parms* cmd, void* CFG, const char* from, const char* to, const char* flags) { int regflags ; proxy_html_conf* cfg = (proxy_html_conf*)CFG ; urlmap* map ; urlmap* newmap = apr_palloc(cmd->pool, sizeof(urlmap) ) ; newmap->next = NULL ; newmap->flags = XREGFLAG(M_HTML,flags,'h') | XREGFLAG(M_EVENTS,flags,'e') | XREGFLAG(M_CDATA,flags,'c') | REGFLAG(M_ATSTART,flags,'^') | REGFLAG(M_ATEND,flags,'$') | REGFLAG(M_REGEX,flags,'R') | REGFLAG(M_LAST,flags,'L') ; if ( cfg->map ) { for ( map = cfg->map ; map->next ; map = map->next ) ; map->next = newmap ; } else cfg->map = newmap ; if ( ! (newmap->flags & M_REGEX) ) { newmap->from.c = apr_pstrdup(cmd->pool, from) ; newmap->to = apr_pstrdup(cmd->pool, to) ; } else { regflags = REGFLAG(AP_REG_EXTENDED,flags,'x') | REGFLAG(AP_REG_ICASE,flags,'i') | REGFLAG(AP_REG_NOSUB,flags,'n') | REGFLAG(AP_REG_NEWLINE,flags,'s') ; newmap->from.r = ap_pregcomp(cmd->pool, from, regflags) ; newmap->to = apr_pstrdup(cmd->pool, to) ; } return NULL ; } static const char* set_doctype(cmd_parms* cmd, void* CFG, const char* t, const char* l) { proxy_html_conf* cfg = (proxy_html_conf*)CFG ; if ( !strcasecmp(t, "xhtml") ) { cfg->etag = xhtml_etag ; if ( l && !strcasecmp(l, "legacy") ) cfg->doctype = fpi_xhtml_legacy ; else cfg->doctype = fpi_xhtml ; } else if ( !strcasecmp(t, "html") ) { cfg->etag = html_etag ; if ( l && !strcasecmp(l, "legacy") ) cfg->doctype = fpi_html_legacy ; else cfg->doctype = fpi_html ; } else { cfg->doctype = apr_pstrdup(cmd->pool, t) ; if ( l && ( ( l[0] == 'x' ) || ( l[0] == 'X' ) ) ) cfg->etag = xhtml_etag ; else cfg->etag = html_etag ; } return NULL ; } static void set_param(proxy_html_conf* cfg, const char* arg) { if ( arg && *arg ) { if ( !strcmp(arg, "lowercase") ) cfg->flags |= NORM_LC ; else if ( !strcmp(arg, "dospath") ) cfg->flags |= NORM_MSSLASH ; else if ( !strcmp(arg, "reset") ) cfg->flags |= NORM_RESET ; } } static const char* set_flags(cmd_parms* cmd, void* CFG, const char* arg1, const char* arg2, const char* arg3) { set_param( (proxy_html_conf*)CFG, arg1) ; set_param( (proxy_html_conf*)CFG, arg2) ; set_param( (proxy_html_conf*)CFG, arg3) ; return NULL ; } static const command_rec proxy_html_cmds[] = { AP_INIT_TAKE23("ProxyHTMLURLMap", set_urlmap, NULL, RSRC_CONF|ACCESS_CONF, "Map URL From To" ) , AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL, RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]" ) , AP_INIT_TAKE123("ProxyHTMLFixups", set_flags, NULL, RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath" ) , AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot, (void*)APR_OFFSETOF(proxy_html_conf, metafix), RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements" ) , AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot, (void*)APR_OFFSETOF(proxy_html_conf, extfix), RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS" ) , AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot, (void*)APR_OFFSETOF(proxy_html_conf, strip_comments), RSRC_CONF|ACCESS_CONF, "Strip out comments" ) , #ifndef GO_FASTER AP_INIT_FLAG("ProxyHTMLLogVerbose", ap_set_flag_slot, (void*)APR_OFFSETOF(proxy_html_conf, verbose), RSRC_CONF|ACCESS_CONF, "Verbose Logging (use with LogLevel Info)" ) , #endif AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot, (void*)APR_OFFSETOF(proxy_html_conf, bufsz), RSRC_CONF|ACCESS_CONF, "Buffer size" ) , { NULL } } ; static int mod_proxy_html(apr_pool_t* p, apr_pool_t* p1, apr_pool_t* p2, server_rec* s) { ap_add_version_component(p, VERSION_STRING) ; return OK ; } static void proxy_html_hooks(apr_pool_t* p) { ap_register_output_filter("proxy-html", proxy_html_filter, NULL, AP_FTYPE_RESOURCE) ; ap_hook_post_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE) ; ap_hook_child_init(proxy_html_child_init, NULL, NULL, APR_HOOK_MIDDLE) ; } module AP_MODULE_DECLARE_DATA proxy_html_module = { STANDARD20_MODULE_STUFF, proxy_html_config, proxy_html_merge, NULL, NULL, proxy_html_cmds, proxy_html_hooks } ;
--------------------------------------------------------------------- The official User-To-User support forum of the Apache HTTP Server Project. See <URL:http://httpd.apache.org/userslist.html> for more info. To unsubscribe, e-mail: users-unsubscribe@xxxxxxxxxxxxxxxx " from the digest: users-digest-unsubscribe@xxxxxxxxxxxxxxxx For additional commands, e-mail: users-help@xxxxxxxxxxxxxxxx