/*
 * Return the length in bytes of the UTF-8 sequence introduced by the
 * byte tc.
 *
 * The length is taken from the number of leading one bits of tc:
 *	0xxxxxxx -> 1 (ASCII)
 *	110xxxxx -> 2
 *	1110xxxx -> 3
 *	11110xxx -> 4
 * A continuation byte (10xxxxxx) yields 1, as does any byte that would
 * announce more than four bytes; RFC 3629 defines no such sequences, so
 * they are stepped over one byte at a time instead of being trusted.
 *
 * The result is always in the range 1..4.
 */
static size_t
codepointsize(char tc)
{
	/*
	 * Work on an unsigned copy: plain char may be signed, and
	 * left-shifting a negative value is undefined behaviour.
	 */
	unsigned char lead = (unsigned char)tc;
	size_t len = 0;

	if ((lead & 0x80) == 0)
		return 1;

	/* Count the leading one bits. */
	while (lead & 0x80) {
		len++;
		lead = (unsigned char)(lead << 1);
	}

	/* 0xF8..0xFF never start a valid sequence. */
	if (len > 4)
		return 1;
	return len;
}
charset : charset7); - wr = mime_write_tob64(&cin, fo, 1); - fwrite("?=", sizeof (char), 2, fo); - wr += 7 + charsetlen; - sz += wr, col += wr; - if (wend < upper) { - fwrite("\n ", sizeof (char), - 2, fo); - sz += 2; - col = 0; - maxcol = 76; + /* + * we calculate the maximum number of bytes + * we can use on this output line, and then what + * this equates to as base64 encoded source bytes + */ + maxout = maxcol - col - 7 - charsetlen; + maxin = (maxout - (maxout & 0x03)) * 3/4; + + /* short enough to finish ? */ + if (maxin > upper - wbeg ) + { + curin = upper - wbeg; + wbeg += curin; + }else + { + if (is_utf8) + { + /* + * now scan the input from the beginning + * to see how many codepoints will fit + */ + curin = 0; + while (curin < maxin + && (cpsz = codepointsize(*wbeg)) <= (maxin - curin)) + { + curin += cpsz; + wbeg += cpsz; } - break; - } else { - if (col) { - fprintf(fo, "\n "); - sz += 2; - col = 0; - maxcol = 76; - } else - wend -= 4; + }else + { + curin = maxin; + wbeg += maxin; } } + cin.l = curin; + fprintf(fo, "%s=?%s?B?", (cin.s != in->s) ? " " : "", thisset ); + wr = mime_write_tob64(&cin, fo, 1); + + if (wbeg < upper) + { + wr += fwrite("?=\n ", sizeof (char), 4, fo) * sizeof (char); + }else + { + wr += fwrite("?=", sizeof (char), 2, fo) * sizeof (char); + } + + /* and shuffle pointers and counts */ + col = 1; + maxcol = 76; + sz += wr + 7 + charsetlen + ((cin.s != in->s) ? 1 : 0 ); } } else { /* @@ -1243,7 +1295,29 @@ mime_write_tohdr(struct str *in, FILE *fo) maxcol -= wbeg - lastspc; } else { - wend -= 4; + if (is_utf8) + { + /* + * make sure wend is not pointing to + * the middle of a codepoint + */ + cp = wend; + while (--cp > wbeg) + { + cps = codepointsize(*cp); + if (cps > 1) + { + if (wend - cp - cps > 4) + wend -= 4; + else + wend = cp; + break; + } + } + if (cp == wbeg) + wend -= 4; + } else + wend -= 4; } free(cout.s); }