/*$Id: unfoldHDR.c,v 1.14 1999/11/06 05:25:14 lindberg Exp $*/
/*$Name: ezmlm-idx-040 $*/

#include "stralloc.h"
#include "strerr.h"
#include "case.h"
#include "byte.h"
#include "errtxt.h"
#include "mime.h"

/* Scratch buffer for unfoldHDR(): holds the cleaned-up header text before */
/* it is copied to the caller's outdata. It is emptied at the start of     */
/* every unfoldHDR() call.                                                 */
static stralloc tmpdata = {0};

static int trimre(cpp,cpend,prefix,fatal)
char **cpp;
char *cpend;
stralloc *prefix;
char *fatal;
	/* Strips leading reply indicators ("Re: ", "Re[2]: ", ...) and the  */
	/* list prefix from the text *cpp..cpend, where cpend points at the  */
	/* LAST character (not one past it). The position of '#' in prefix,  */
	/* as reported by byte_rchr(), marks where a run of digits/spaces    */
	/* (the serial number) is accepted in the subject. On return *cpp    */
	/* points at the first character that was kept. fatal is not used    */
	/* by this routine. Returns a bit mask:                              */
	/*   1 - a reply indicator was removed                               */
	/*   2 - the prefix was removed                                      */
{
  int r = 0;
  register char *cp;
  char *cpnew;
  int junk;
  unsigned int i,j;
  unsigned int serial;

  cp = *cpp;
  serial = prefix->len;         /* offset of serial number marker */
  if (serial)
    serial = byte_rchr(prefix->s,prefix->len,'#');

  junk = 1;
  while (junk) {                /* repeat until a pass removes nothing */
    junk = 0;
    while (cp <= cpend && (*cp == ' ' || *cp == '\t')) cp++;
    cpnew = cp;
    while (++cpnew <= cpend) {  /* /(..+:\s)/ is a reply indicator */
      if (*cpnew == ' ') {      /* only the first ' ' is considered */
        if (cpnew < cp + 3) break;      /* at least 3 char before ' ' */
        if (*(cpnew - 1) != ':') break; /* require ':' before ' ' */
        if (cpnew > cp + 5) {           /* if > 4 char before ':' require */
          register char ch;
          ch = *(cpnew - 2);            /* XX^3, XX[3], XX(3) */
          if (ch != ')' && ch != ']' && (ch < '0' || ch > '9'))
            break;
        }
        junk = 1;
        r |= 1;
        cp = cpnew + 1;
        break;
      }
    }
        /* prefix removal is complicated by the inconsistent handling of ' ' */
        /* when there are rfc2047-encoded words in the subject. We first     */
        /* compare prefix before "serial" ignoring space, then skip the      */
        /* number, then compare after "serial". If both matched we've found  */
        /* the prefix. */
    if (serial) {
      cpnew = cp;
      i = 0;
      while (i < serial && cpnew <= cpend) {
        if (*cpnew != ' ') {
          if (prefix->s[i] == ' ') {    /* space in prefix only: skip it */
            ++i;
            continue;
          }
          if (*cpnew != prefix->s[i]) break;
          ++i;
        }
        ++cpnew;
      }
      if (i == serial) {                /* match before serial */
        j = prefix->len;
        if (serial != j) {              /* got a '#' */
                /* skip number/space. The parentheses matter: without    */
                /* them '&&' binds tighter than '||' and the digit test  */
                /* would run on bytes beyond cpend.                      */
          while (cpnew <= cpend &&
                 (*cpnew == ' ' || (*cpnew <= '9' && *cpnew >= '0')))
            ++cpnew;
          i = serial + 1;
          while (i < j && cpnew <= cpend) {
            if (*cpnew != ' ') {
              if (prefix->s[i] == ' ') {
                ++i;
                continue;
              }
              if (*cpnew != prefix->s[i]) break;
              ++i;
            }
            ++cpnew;
          }
        }
        if (i == j) {
                /* Only go around again if something was consumed; a     */
                /* prefix that matches zero characters (e.g. all spaces) */
                /* would otherwise keep the outer loop spinning forever. */
          if (cpnew != cp) junk = 1;
          cp = cpnew;
          r |= 2;
        }
      }
    }
  }
  *cpp = cp;
  return r;
}

static int trimend(indata,np,fatal)
char *indata;
unsigned int *np;
char *fatal;
	/* Works backwards from the end of the *np bytes at indata, dropping */
	/* trailing ' ', '\t', '\r' and '\n' and any trailing "-Reply" (as   */
	/* matched by case_startb). This is repeated until neither applies.  */
	/* *np is updated to the remaining length. fatal is not used.        */
	/* Returns 0 - no "-Reply" removed, 1 - "-Reply" removed.            */
{
  unsigned int len;
  int found = 0;
  char ch;

  len = *np;
  if (len == 0) return 0;               /* nothing to trim */

  for (;;) {
    while (len > 0) {                   /* strip trailing white space */
      ch = indata[len - 1];
      if (ch != ' ' && ch != '\t' && ch != '\r' && ch != '\n') break;
      --len;
    }
                /* need at least 6 bytes left to hold "-Reply" */
    if (len < 6 || !case_startb(indata + len - 6,6,"-Reply")) break;
    len -= 6;
    found = 1;                          /* and look again */
  }

  *np = len;                            /* new length */
  return found;
}

int unfoldHDR(indata,n,outdata,charset,prefix,flagtrimsub,fatal)
char *indata;
unsigned int n;
stralloc *outdata;
char *charset;
stralloc *prefix;
int flagtrimsub;
char *fatal;
	/* Takes the n bytes of header text at indata. Line unfolding and Q  */
	/* and B decoding should already have been done by the caller. This  */
	/* routine removes redundant iso-2022 shift/escape sequences,        */
	/* removes a trailing "-Reply" and, if flagtrimsub is set, leading   */
	/* reply indicators and the list prefix. The result is stored in     */
	/* outdata as a single line without trailing '\n' or '\0'; any       */
	/* embedded '\n' or '\0' is replaced by ' '.                         */
	/* For iso-2022-jp input the escape sequences in indata are          */
	/* normalized in place, i.e. indata is modified.                     */
	/* fatal is handed on to die_nomem() and the trim helpers.           */
	/* returns 0 = no reply, no prefix */
	/*         1 = reply, no prefix    */
	/*         2 = no reply, prefix    */
	/*         3 = reply & prefix      */
{
  int r = 0;
  char *cp,*cpesc,*cpnext,*cpend,*cpout;
  char state,cset,newcset;
  int reg,newreg;

  cp = indata;
  cpend = cp + n - 1;           /* last byte of input */
  cpnext = cp;                  /* start of text not yet copied to tmpdata */
  if (!stralloc_copys(&tmpdata,"")) die_nomem(fatal);
  if (!stralloc_ready(&tmpdata,n)) die_nomem(fatal);

  if(!case_diffb(charset,11,"iso-2022-jp")) {
        /* iso-2022-jp-2 (rfc1554) and its subset iso-2022-jp. The reg #s */
        /* are from the rfc. Don't ask why they have multiple length G0   */
        /* charset designations ... JIS X 0201-roman is identical to      */
        /* iso646 us-ascii except for currency and tilde. Making them the */
        /* same increases hits without significant loss. JIS X 0208-1978  */
        /* is superseded by JIS X 0208-1983 and converted here as well.   */

                /* pass 1: ESC ( J -> ESC ( B and ESC $ @ -> ESC $ B */
    while (cp < cpend) {
      if (*cp++ != ESC) continue;
      if (*cp == '(') {
        if (++cp > cpend) break;
        if (*cp == 'J') *cp = 'B';
        ++cp;
      } else if (*cp == '$') {
        if (++cp > cpend) break;
        if (*cp == '@') *cp = 'B';
        ++cp;
      }
    }
                /* pass 2: eliminate redundant ESC seqs */
    cp = indata;
    cpnext = cp;
    reg = 6;                    /* line starts out in us-ascii */
    while (cp < cpend) {
      if (*cp++ != ESC) continue;
      cpesc = cp - 1;           /* start of this escape sequence */
      if (*cp == '$') {
        if (++cp > cpend) break;
        if (*cp == 'B') newreg = 87;
        else if (*cp == 'A') newreg = 58;
        else if (*cp == '(') {
          if (++cp > cpend) break;
          if (*cp == 'C') newreg = 149;
          else if (*cp == 'D') newreg = 159;
          else continue;
        } else continue;
      } else if (*cp == '(') {
        if (++cp > cpend) break;
        if (*cp == 'B') newreg = 6;
        else continue;
      } else continue;
      if (++cp > cpend) break;
      while (*cp == ' ' || *cp == '\t')
        if (++cp >= cpend) break;       /* skip space */
                /* maybe another G0 designation; only peek at cp+1 when */
                /* it is still inside the input                         */
      if (cp < cpend && *cp == ESC &&
          (*(cp+1) == '(' || *(cp+1) == '$')) {         /* yep! */
        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
        cpnext = cp;
        continue;
      }
      if (reg == newreg) {      /* designation already active: drop it */
        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
        cpnext = cp;
      } else {
        reg = newreg;
      }
    }
                /* copy remainder of line */
    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
    if (reg != 6) {     /* need to return to us-ascii at the end of the line */
      if (!stralloc_cats(&tmpdata,TOASCII)) die_nomem(fatal);
    } else {            /* maybe "-Reply at the end?" */
      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
    }

  } else if (!case_diffb(charset,11,"iso-2022-cn") ||
             !case_diffb(charset,11,"iso-2022-kr")) {
        /* these use SI/SO and ESC $ ) x as the SO designation. In -cn and */
        /* -cn-ext, 'x' can be a number of different letters. In -kr it's  */
        /* always 'C'. This routine may work also for other iso-2022 sets  */
        /* also handles iso-2022-cn-ext */
    state = SI;         /* us-ascii */
    cset = 0;           /* no SO-designation seen yet */
    --cp;               /* set up for loop */

    while (++cp <= cpend) {
      if (*cp == SI || *cp == SO) {
        if (state == *cp) {     /* already in state: drop the shift char */
                /* copy everything before the shift char, resume after it */
          if (!stralloc_catb(&tmpdata,cpnext,cp-cpnext)) die_nomem(fatal);
          cpnext = cp + 1;
        } else                  /* set new state */
          state = *cp;
        continue;
      }
      if (*cp != ESC) continue;
      if (cpend - cp < 3) continue;     /* no room for ESC $ ) x */
      if (*(cp+1) != '$' || *(cp+2) != ')') continue;
      cpesc = cp;                       /* start of SO-designation */
      newcset = *(cp+3);
      cp += 4;                          /* first byte after designation */
      while (cp <= cpend && (*cp == ' ' || *cp == '\t')) ++cp;
      if (newcset == cset ||
          (cpend - cp >= 3 &&
           *cp == ESC && *(cp+1) == '$' && *(cp+2) == ')')) {
                /* this SO-designation is already active, or a second */
                /* one follows right after: drop this one             */
        if (!stralloc_catb(&tmpdata,cpnext,cpesc-cpnext)) die_nomem(fatal);
        cpnext = cpesc + 4;
      } else
        cset = newcset;
      --cp;             /* "unpeek" so that next iteration will see char */
    }
                        /* get remainder of line */
    if (!stralloc_catb(&tmpdata,cpnext,cpend - cpnext + 1)) die_nomem(fatal);
    if (state != SI) {  /* need to end in ascii */
      if (!stralloc_cats(&tmpdata,TOSI)) die_nomem(fatal);
    } else {            /* ascii end; maybe "-Reply" at the end? */
      r = trimend(tmpdata.s,&(tmpdata.len),fatal);
    }

  } else {              /* other character sets = no special treatment */
    r = trimend(cp,&n,fatal);           /* -reply */
    if (!stralloc_copyb(&tmpdata,cp,n)) die_nomem(fatal);
  }

  cp = tmpdata.s;
  n = tmpdata.len;
  cpend = cp + n - 1;
  if (flagtrimsub) {     /* remove leading reply indicators & prefix*/
    r |= trimre(&cp,cpend,prefix,fatal);
    n = (unsigned int) (cpend-cp+1);
  }
                        /* there shouldn't be '\0' or '\n', but make sure as */
                        /* it would break the message index */
  if (!stralloc_copys(outdata,"")) die_nomem(fatal);
  if (!stralloc_ready(outdata,n)) die_nomem(fatal);
  outdata->len = n;
  cpout = outdata->s;
  while (n--) {         /* '\n' and '\0' would break the subject index */
    if (!*cp || *cp == '\n') *cpout = ' ';
    else *cpout = *cp;
    ++cp; ++cpout;
  }
  return r;
}