From bdde82de7756dd598fa6e77f663ac9b2e4e9bca6 Mon Sep 17 00:00:00 2001 From: AlmaLinux RelEng Bot Date: Tue, 10 Mar 2026 15:55:58 -0400 Subject: [PATCH] import UBI postgresql-13.23-2.el9_7 --- ...6-2004--CVE-2026-2005--CVE-2026-2006.patch | 6357 +++++++++++++++++ SPECS/postgresql.spec | 7 +- 2 files changed, 6363 insertions(+), 1 deletion(-) create mode 100644 SOURCES/CVE-2026-2004--CVE-2026-2005--CVE-2026-2006.patch diff --git a/SOURCES/CVE-2026-2004--CVE-2026-2005--CVE-2026-2006.patch b/SOURCES/CVE-2026-2004--CVE-2026-2005--CVE-2026-2006.patch new file mode 100644 index 0000000..ce70126 --- /dev/null +++ b/SOURCES/CVE-2026-2004--CVE-2026-2005--CVE-2026-2006.patch @@ -0,0 +1,6357 @@ +diff --git a/contrib/btree_gist/btree_utils_var.c b/contrib/btree_gist/btree_utils_var.c +index 2886c08b85e..9d93b3c775e 100644 +--- a/contrib/btree_gist/btree_utils_var.c ++++ b/contrib/btree_gist/btree_utils_var.c +@@ -116,36 +116,47 @@ gbt_var_leaf2node(GBT_VARKEY *leaf, const gbtree_vinfo *tinfo, FmgrInfo *flinfo) + + /* + * returns the common prefix length of a node key ++ * ++ * If the underlying type is character data, the prefix length may point in ++ * the middle of a multibyte character. 
+ */ + static int32 + gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) + { + GBT_VARKEY_R r = gbt_var_key_readable(node); + int32 i = 0; +- int32 l = 0; ++ int32 l_left_to_match = 0; ++ int32 l_total = 0; + int32 t1len = VARSIZE(r.lower) - VARHDRSZ; + int32 t2len = VARSIZE(r.upper) - VARHDRSZ; + int32 ml = Min(t1len, t2len); + char *p1 = VARDATA(r.lower); + char *p2 = VARDATA(r.upper); ++ const char *end1 = p1 + t1len; ++ const char *end2 = p2 + t2len; + + if (ml == 0) + return 0; + + while (i < ml) + { +- if (tinfo->eml > 1 && l == 0) ++ if (tinfo->eml > 1 && l_left_to_match == 0) + { +- if ((l = pg_mblen(p1)) != pg_mblen(p2)) ++ l_total = pg_mblen_range(p1, end1); ++ if (l_total != pg_mblen_range(p2, end2)) + { + return i; + } ++ l_left_to_match = l_total; + } + if (*p1 != *p2) + { + if (tinfo->eml > 1) + { +- return (i - l + 1); ++ int32 l_matched_subset = l_total - l_left_to_match; ++ ++ /* end common prefix at final byte of last matching char */ ++ return i - l_matched_subset; + } + else + { +@@ -155,7 +166,7 @@ gbt_var_node_cp_len(const GBT_VARKEY *node, const gbtree_vinfo *tinfo) + + p1++; + p2++; +- l--; ++ l_left_to_match--; + i++; + } + return ml; /* lower == upper */ +diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c +index 1065d64ccb0..9bca5a21b8d 100644 +--- a/contrib/dict_xsyn/dict_xsyn.c ++++ b/contrib/dict_xsyn/dict_xsyn.c +@@ -48,15 +48,15 @@ find_word(char *in, char **end) + char *start; + + *end = NULL; +- while (*in && t_isspace(in)) +- in += pg_mblen(in); ++ while (*in && t_isspace_cstr(in)) ++ in += pg_mblen_cstr(in); + + if (!*in || *in == '#') + return NULL; + start = in; + +- while (*in && !t_isspace(in)) +- in += pg_mblen(in); ++ while (*in && !t_isspace_cstr(in)) ++ in += pg_mblen_cstr(in); + + *end = in; + +diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c +index f369c68187b..0b1e0581e84 100644 +--- a/contrib/hstore/hstore_io.c ++++ b/contrib/hstore/hstore_io.c +@@ -81,7 
+81,9 @@ get_val(HSParser *state, bool ignoreeq, bool *escaped) + } + else if (*(state->ptr) == '=' && !ignoreeq) + { +- elog(ERROR, "Syntax error near '%c' at position %d", *(state->ptr), (int32) (state->ptr - state->begin)); ++ elog(ERROR, "Syntax error near \"%.*s\" at position %d", ++ pg_mblen_cstr(state->ptr), state->ptr, ++ (int32) (state->ptr - state->begin)); + } + else if (*(state->ptr) == '\\') + { +@@ -220,7 +222,9 @@ parse_hstore(HSParser *state) + } + else if (!scanner_isspace((unsigned char) *(state->ptr))) + { +- elog(ERROR, "Syntax error near '%c' at position %d", *(state->ptr), (int32) (state->ptr - state->begin)); ++ elog(ERROR, "Syntax error near \"%.*s\" at position %d", ++ pg_mblen_cstr(state->ptr), state->ptr, ++ (int32) (state->ptr - state->begin)); + } + } + else if (st == WGT) +@@ -235,7 +239,9 @@ parse_hstore(HSParser *state) + } + else + { +- elog(ERROR, "Syntax error near '%c' at position %d", *(state->ptr), (int32) (state->ptr - state->begin)); ++ elog(ERROR, "Syntax error near \"%.*s\" at position %d", ++ pg_mblen_cstr(state->ptr), state->ptr, ++ (int32) (state->ptr - state->begin)); + } + } + else if (st == WVAL) +@@ -268,7 +274,9 @@ parse_hstore(HSParser *state) + } + else if (!scanner_isspace((unsigned char) *(state->ptr))) + { +- elog(ERROR, "Syntax error near '%c' at position %d", *(state->ptr), (int32) (state->ptr - state->begin)); ++ elog(ERROR, "Syntax error near \"%.*s\" at position %d", ++ pg_mblen_cstr(state->ptr), state->ptr, ++ (int32) (state->ptr - state->begin)); + } + } + else +diff --git a/contrib/intarray/_int_selfuncs.c b/contrib/intarray/_int_selfuncs.c +index bcb785b15b2..66ba874391d 100644 +--- a/contrib/intarray/_int_selfuncs.c ++++ b/contrib/intarray/_int_selfuncs.c +@@ -19,6 +19,7 @@ + #include "catalog/pg_operator.h" + #include "catalog/pg_statistic.h" + #include "catalog/pg_type.h" ++#include "commands/extension.h" + #include "miscadmin.h" + #include "utils/builtins.h" + #include "utils/lsyscache.h" +@@ 
-171,7 +172,18 @@ _int_matchsel(PG_FUNCTION_ARGS) + PG_RETURN_FLOAT8(0.0); + } + +- /* The caller made sure the const is a query, so get it now */ ++ /* ++ * Verify that the Const is a query_int, else return a default estimate. ++ * (This could only fail if someone attached this estimator to the wrong ++ * operator.) ++ */ ++ if (((Const *) other)->consttype != ++ get_function_sibling_type(fcinfo->flinfo->fn_oid, "query_int")) ++ { ++ ReleaseVariableStats(vardata); ++ PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); ++ } ++ + query = DatumGetQueryTypeP(((Const *) other)->constvalue); + + /* Empty query matches nothing */ +diff --git a/contrib/ltree/lquery_op.c b/contrib/ltree/lquery_op.c +index ef86046fc4b..abccbb8927f 100644 +--- a/contrib/ltree/lquery_op.c ++++ b/contrib/ltree/lquery_op.c +@@ -26,14 +26,14 @@ getlexeme(char *start, char *end, int *len) + char *ptr; + int charlen; + +- while (start < end && (charlen = pg_mblen(start)) == 1 && t_iseq(start, '_')) ++ while (start < end && (charlen = pg_mblen_range(start, end)) == 1 && t_iseq(start, '_')) + start += charlen; + + ptr = start; + if (ptr >= end) + return NULL; + +- while (ptr < end && !((charlen = pg_mblen(ptr)) == 1 && t_iseq(ptr, '_'))) ++ while (ptr < end && !((charlen = pg_mblen_range(ptr, end)) == 1 && t_iseq(ptr, '_'))) + ptr += charlen; + + *len = ptr - start; +diff --git a/contrib/ltree/ltree.h b/contrib/ltree/ltree.h +index 83fc705ef86..852fc9f5f5e 100644 +--- a/contrib/ltree/ltree.h ++++ b/contrib/ltree/ltree.h +@@ -113,7 +113,8 @@ typedef struct + + #define LQUERY_HASNOT 0x01 + +-#define ISALNUM(x) ( t_isalpha(x) || t_isdigit(x) || ( pg_mblen(x) == 1 && t_iseq((x), '_') ) ) ++/* Caller has already called mblen, so we can use _unbounded variants safely. 
*/ ++#define ISALNUM(x) ( t_isalpha_unbounded(x) || t_isdigit_unbounded(x) || ( pg_mblen_unbounded(x) == 1 && t_iseq((x), '_') ) ) + + /* full text query */ + +diff --git a/contrib/ltree/ltree_io.c b/contrib/ltree/ltree_io.c +index 15115cb29f3..0a44a8c4691 100644 +--- a/contrib/ltree/ltree_io.c ++++ b/contrib/ltree/ltree_io.c +@@ -54,7 +54,7 @@ parse_ltree(const char *buf) + ptr = buf; + while (*ptr) + { +- charlen = pg_mblen(ptr); ++ charlen = pg_mblen_cstr(ptr); + if (t_iseq(ptr, '.')) + num++; + ptr += charlen; +@@ -69,7 +69,7 @@ parse_ltree(const char *buf) + ptr = buf; + while (*ptr) + { +- charlen = pg_mblen(ptr); ++ charlen = pg_mblen_cstr(ptr); + + switch (state) + { +@@ -285,7 +285,7 @@ parse_lquery(const char *buf) + ptr = buf; + while (*ptr) + { +- charlen = pg_mblen(ptr); ++ charlen = pg_mblen_cstr(ptr); + + if (t_iseq(ptr, '.')) + num++; +@@ -305,7 +305,7 @@ parse_lquery(const char *buf) + ptr = buf; + while (*ptr) + { +- charlen = pg_mblen(ptr); ++ charlen = pg_mblen_cstr(ptr); + + switch (state) + { +@@ -402,7 +402,7 @@ parse_lquery(const char *buf) + case LQPRS_WAITFNUM: + if (t_iseq(ptr, ',')) + state = LQPRS_WAITSNUM; +- else if (t_isdigit(ptr)) ++ else if (t_isdigit_cstr(ptr)) + { + int low = atoi(ptr); + +@@ -420,7 +420,7 @@ parse_lquery(const char *buf) + UNCHAR; + break; + case LQPRS_WAITSNUM: +- if (t_isdigit(ptr)) ++ if (t_isdigit_cstr(ptr)) + { + int high = atoi(ptr); + +@@ -451,7 +451,7 @@ parse_lquery(const char *buf) + case LQPRS_WAITCLOSE: + if (t_iseq(ptr, '}')) + state = LQPRS_WAITEND; +- else if (!t_isdigit(ptr)) ++ else if (!t_isdigit_cstr(ptr)) + UNCHAR; + break; + case LQPRS_WAITND: +@@ -462,7 +462,7 @@ parse_lquery(const char *buf) + } + else if (t_iseq(ptr, ',')) + state = LQPRS_WAITSNUM; +- else if (!t_isdigit(ptr)) ++ else if (!t_isdigit_cstr(ptr)) + UNCHAR; + break; + case LQPRS_WAITEND: +diff --git a/contrib/ltree/ltxtquery_io.c b/contrib/ltree/ltxtquery_io.c +index d967f92110f..7f98bdedecb 100644 +--- 
a/contrib/ltree/ltxtquery_io.c ++++ b/contrib/ltree/ltxtquery_io.c +@@ -59,7 +59,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint + + for (;;) + { +- charlen = pg_mblen(state->buf); ++ charlen = pg_mblen_cstr(state->buf); + + switch (state->state) + { +@@ -83,7 +83,7 @@ gettoken_query(QPRS_STATE *state, int32 *val, int32 *lenval, char **strval, uint + *lenval = charlen; + *flag = 0; + } +- else if (!t_isspace(state->buf)) ++ else if (!t_isspace_unbounded(state->buf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("operand syntax error"))); +diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c +index 1e9df8dfcf6..189bcfa9cac 100644 +--- a/contrib/pageinspect/heapfuncs.c ++++ b/contrib/pageinspect/heapfuncs.c +@@ -26,6 +26,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "mb/pg_wchar.h" + #include "access/relation.h" + #include "catalog/pg_am_d.h" + #include "catalog/pg_type.h" +@@ -99,7 +100,8 @@ text_to_bits(char *str, int len) + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), +- errmsg("illegal character '%c' in t_bits string", str[off]))); ++ errmsg("invalid character \"%.*s\" in t_bits string", ++ pg_mblen_cstr(str + off), str + off))); + + if (off % 8 == 7) + bits[off / 8] = byte; +diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile +index d75e9ada2e4..970380adfb0 100644 +--- a/contrib/pg_trgm/Makefile ++++ b/contrib/pg_trgm/Makefile +@@ -14,7 +14,7 @@ DATA = pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \ + pg_trgm--1.0--1.1.sql + PGFILEDESC = "pg_trgm - trigram matching" + +-REGRESS = pg_trgm pg_word_trgm pg_strict_word_trgm ++REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm + + ifdef USE_PGXS + PG_CONFIG = pg_config +diff --git a/contrib/pg_trgm/data/trgm_utf8.data b/contrib/pg_trgm/data/trgm_utf8.data +new file mode 100644 +index 00000000000..713856e76a6 +--- /dev/null ++++ b/contrib/pg_trgm/data/trgm_utf8.data +@@ -0,0 
+1,50 @@ ++Mathematics ++数学 ++गणित ++Matemáticas ++رياضيات ++Mathématiques ++গণিত ++Matemática ++Математика ++ریاضی ++Matematika ++Mathematik ++数学 ++Mathematics ++गणित ++గణితం ++Matematik ++கணிதம் ++數學 ++Toán học ++Matematika ++数学 ++수학 ++ریاضی ++Lissafi ++Hisabati ++Matematika ++Matematica ++ریاضی ++ಗಣಿತ ++ગણિત ++คณิตศาสตร์ ++ሂሳብ ++गणित ++ਗਣਿਤ ++數學 ++数学 ++Iṣiro ++數學 ++သင်္ချာ ++Herrega ++رياضي ++गणित ++Математика ++Matematyka ++ഗണിതം ++Matematika ++رياضي ++Matematika ++Matematică +diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm.out b/contrib/pg_trgm/expected/pg_utf8_trgm.out +new file mode 100644 +index 00000000000..0768e7d6a83 +--- /dev/null ++++ b/contrib/pg_trgm/expected/pg_utf8_trgm.out +@@ -0,0 +1,8 @@ ++SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset ++\if :skip_test ++\quit ++\endif ++-- Index 50 translations of the word "Mathematics" ++CREATE TEMP TABLE mb (s text); ++\copy mb from 'data/trgm_utf8.data' ++CREATE INDEX ON mb USING gist(s gist_trgm_ops); +diff --git a/contrib/pg_trgm/expected/pg_utf8_trgm_1.out b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out +new file mode 100644 +index 00000000000..8505c4fa552 +--- /dev/null ++++ b/contrib/pg_trgm/expected/pg_utf8_trgm_1.out +@@ -0,0 +1,3 @@ ++SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset ++\if :skip_test ++\quit +diff --git a/contrib/pg_trgm/sql/pg_utf8_trgm.sql b/contrib/pg_trgm/sql/pg_utf8_trgm.sql +new file mode 100644 +index 00000000000..0dd962ced83 +--- /dev/null ++++ b/contrib/pg_trgm/sql/pg_utf8_trgm.sql +@@ -0,0 +1,9 @@ ++SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset ++\if :skip_test ++\quit ++\endif ++ ++-- Index 50 translations of the word "Mathematics" ++CREATE TEMP TABLE mb (s text); ++\copy mb from 'data/trgm_utf8.data' ++CREATE INDEX ON mb USING gist(s gist_trgm_ops); +diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h +index b616953462e..76ee4bbdc68 100644 +--- a/contrib/pg_trgm/trgm.h ++++ b/contrib/pg_trgm/trgm.h +@@ -51,10 +51,10 @@ typedef char 
trgm[3]; + } while(0) + + #ifdef KEEPONLYALNUM +-#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c)) ++#define ISWORDCHR(c, len) (t_isalpha_with_len(c, len) || t_isdigit_with_len(c, len)) + #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') ) + #else +-#define ISWORDCHR(c) (!t_isspace(c)) ++#define ISWORDCHR(c, len) (!t_isspace_with_len(c, len)) + #define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) ) + #endif + #define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) ) +diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c +index fb38135f7a3..63895c3017d 100644 +--- a/contrib/pg_trgm/trgm_op.c ++++ b/contrib/pg_trgm/trgm_op.c +@@ -171,18 +171,29 @@ static char * + find_word(char *str, int lenstr, char **endword, int *charlen) + { + char *beginword = str; ++ const char *endstr = str + lenstr; + +- while (beginword - str < lenstr && !ISWORDCHR(beginword)) +- beginword += pg_mblen(beginword); ++ while (beginword < endstr) ++ { ++ int clen = pg_mblen_range(beginword, endstr); + +- if (beginword - str >= lenstr) ++ if (ISWORDCHR(beginword, clen)) ++ break; ++ beginword += clen; ++ } ++ ++ if (beginword >= endstr) + return NULL; + + *endword = beginword; + *charlen = 0; +- while (*endword - str < lenstr && ISWORDCHR(*endword)) ++ while (*endword < endstr) + { +- *endword += pg_mblen(*endword); ++ int clen = pg_mblen_range(*endword, endstr); ++ ++ if (!ISWORDCHR(*endword, clen)) ++ break; ++ *endword += clen; + (*charlen)++; + } + +@@ -230,9 +241,9 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) + if (bytelen > charlen) + { + /* Find multibyte character boundaries and apply compact_trigram */ +- int lenfirst = pg_mblen(str), +- lenmiddle = pg_mblen(str + lenfirst), +- lenlast = pg_mblen(str + lenfirst + lenmiddle); ++ int lenfirst = 
pg_mblen_unbounded(str), ++ lenmiddle = pg_mblen_unbounded(str + lenfirst), ++ lenlast = pg_mblen_unbounded(str + lenfirst + lenmiddle); + + while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) + { +@@ -243,7 +254,7 @@ make_trigrams(trgm *tptr, char *str, int bytelen, int charlen) + + lenfirst = lenmiddle; + lenmiddle = lenlast; +- lenlast = pg_mblen(ptr + lenfirst + lenmiddle); ++ lenlast = pg_mblen_unbounded(ptr + lenfirst + lenmiddle); + } + } + else +@@ -723,6 +734,7 @@ get_wildcard_part(const char *str, int lenstr, + { + const char *beginword = str; + const char *endword; ++ const char *endstr = str + lenstr; + char *s = buf; + bool in_leading_wildcard_meta = false; + bool in_trailing_wildcard_meta = false; +@@ -735,11 +747,13 @@ get_wildcard_part(const char *str, int lenstr, + * from this loop to the next one, since we may exit at a word character + * that is in_escape. + */ +- while (beginword - str < lenstr) ++ while (beginword < endstr) + { ++ clen = pg_mblen_range(beginword, endstr); ++ + if (in_escape) + { +- if (ISWORDCHR(beginword)) ++ if (ISWORDCHR(beginword, clen)) + break; + in_escape = false; + in_leading_wildcard_meta = false; +@@ -750,12 +764,12 @@ get_wildcard_part(const char *str, int lenstr, + in_escape = true; + else if (ISWILDCARDCHAR(beginword)) + in_leading_wildcard_meta = true; +- else if (ISWORDCHR(beginword)) ++ else if (ISWORDCHR(beginword, clen)) + break; + else + in_leading_wildcard_meta = false; + } +- beginword += pg_mblen(beginword); ++ beginword += clen; + } + + /* +@@ -788,12 +802,12 @@ get_wildcard_part(const char *str, int lenstr, + * string boundary. Strip escapes during copy. 
+ */ + endword = beginword; +- while (endword - str < lenstr) ++ while (endword < endstr) + { +- clen = pg_mblen(endword); ++ clen = pg_mblen_range(endword, endstr); + if (in_escape) + { +- if (ISWORDCHR(endword)) ++ if (ISWORDCHR(endword, clen)) + { + memcpy(s, endword, clen); + (*charlen)++; +@@ -821,7 +835,7 @@ get_wildcard_part(const char *str, int lenstr, + in_trailing_wildcard_meta = true; + break; + } +- else if (ISWORDCHR(endword)) ++ else if (ISWORDCHR(endword, clen)) + { + memcpy(s, endword, clen); + (*charlen)++; +diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c +index 31192209065..7e235c66946 100644 +--- a/contrib/pg_trgm/trgm_regexp.c ++++ b/contrib/pg_trgm/trgm_regexp.c +@@ -480,7 +480,7 @@ static TRGM *createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph, + static void RE_compile(regex_t *regex, text *text_re, + int cflags, Oid collation); + static void getColorInfo(regex_t *regex, TrgmNFA *trgmNFA); +-static bool convertPgWchar(pg_wchar c, trgm_mb_char *result); ++static int convertPgWchar(pg_wchar c, trgm_mb_char *result); + static void transformGraph(TrgmNFA *trgmNFA); + static void processState(TrgmNFA *trgmNFA, TrgmState *state); + static void addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key); +@@ -815,10 +815,11 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) + for (j = 0; j < charsCount; j++) + { + trgm_mb_char c; ++ int clen = convertPgWchar(chars[j], &c); + +- if (!convertPgWchar(chars[j], &c)) ++ if (!clen) + continue; /* ok to ignore it altogether */ +- if (ISWORDCHR(c.bytes)) ++ if (ISWORDCHR(c.bytes, clen)) + colorInfo->wordChars[colorInfo->wordCharsCount++] = c; + else + colorInfo->containsNonWord = true; +@@ -830,13 +831,15 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA) + + /* + * Convert pg_wchar to multibyte format. +- * Returns false if the character should be ignored completely. ++ * Returns 0 if the character should be ignored completely, else returns its ++ * byte length. 
+ */ +-static bool ++static int + convertPgWchar(pg_wchar c, trgm_mb_char *result) + { + /* "s" has enough space for a multibyte character and a trailing NUL */ + char s[MAX_MULTIBYTE_CHAR_LEN + 1]; ++ int clen; + + /* + * We can ignore the NUL character, since it can never appear in a PG text +@@ -844,11 +847,11 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) + * reconstructing trigrams. + */ + if (c == 0) +- return false; ++ return 0; + + /* Do the conversion, making sure the result is NUL-terminated */ + memset(s, 0, sizeof(s)); +- pg_wchar2mb_with_len(&c, s, 1); ++ clen = pg_wchar2mb_with_len(&c, s, 1); + + /* + * In IGNORECASE mode, we can ignore uppercase characters. We assume that +@@ -870,7 +873,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) + if (strcmp(lowerCased, s) != 0) + { + pfree(lowerCased); +- return false; ++ return 0; + } + pfree(lowerCased); + } +@@ -878,7 +881,7 @@ convertPgWchar(pg_wchar c, trgm_mb_char *result) + + /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ + memcpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); +- return true; ++ return clen; + } + + +diff --git a/contrib/pgcrypto/Makefile b/contrib/pgcrypto/Makefile +--- a/contrib/pgcrypto/Makefile ++++ b/contrib/pgcrypto/Makefile +@@ -53,7 +53,8 @@ + $(CF_TESTS) \ + crypt-md5 \ + pgp-armor pgp-decrypt pgp-encrypt $(CF_PGP_TESTS) \ +- pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-info ++ pgp-pubkey-decrypt pgp-pubkey-encrypt pgp-pubkey-session \ ++ pgp-info + + #REGRESS = init pgp-pubkey-decrypt pgp-decrypt \ + EXTRA_CLEAN = gen-rtab + +diff --git a/contrib/pgcrypto/expected/pgp-decrypt.out b/contrib/pgcrypto/expected/pgp-decrypt.out +index e8250b090ab..a1dd7586f7b 100644 +--- a/contrib/pgcrypto/expected/pgp-decrypt.out ++++ b/contrib/pgcrypto/expected/pgp-decrypt.out +@@ -317,7 +317,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== + (1 row) + + -- expected: da39a3ee5e6b4b0d3255bfef95601890afd80709 +-select encode(digest(pgp_sym_decrypt(dearmor(' ++select 
encode(digest(pgp_sym_decrypt_bytea(dearmor(' + -----BEGIN PGP MESSAGE----- + Comment: dat3.aes.sha1.mdc.s2k3.z0 + +@@ -393,6 +393,28 @@ ERROR: Wrong key or corrupt data + select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); + NOTICE: dbg: parse_literal_data: data type=b + ERROR: Not text data ++-- NUL byte in text decrypt. Ciphertext source: ++-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ ++-- --personal-compress-preferences uncompressed --textmode \ ++-- --personal-cipher-preferences aes --no-emit-version --batch \ ++-- --symmetric --passphrase key --armor ++do $$ ++begin ++ perform pgp_sym_decrypt(dearmor(' ++-----BEGIN PGP MESSAGE----- ++ ++jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH ++vu0YlJP5D5BX7yqZ+Pry7TlDmiFO ++=rV7z ++-----END PGP MESSAGE----- ++'), 'key', 'debug=1'); ++exception when others then ++ raise '%', ++ regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); ++end ++$$; ++ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 ++CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE + -- Decryption with a certain incorrect key yields an apparent BZip2-compressed + -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') + -- until the random prefix gave rise to that property. 
+diff --git a/contrib/pgcrypto/expected/pgp-decrypt_1.out b/contrib/pgcrypto/expected/pgp-decrypt_1.out +index 63d5ab98654..7bcf32ec541 100644 +--- a/contrib/pgcrypto/expected/pgp-decrypt_1.out ++++ b/contrib/pgcrypto/expected/pgp-decrypt_1.out +@@ -313,7 +313,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== + (1 row) + + -- expected: da39a3ee5e6b4b0d3255bfef95601890afd80709 +-select encode(digest(pgp_sym_decrypt(dearmor(' ++select encode(digest(pgp_sym_decrypt_bytea(dearmor(' + -----BEGIN PGP MESSAGE----- + Comment: dat3.aes.sha1.mdc.s2k3.z0 + +@@ -389,6 +389,28 @@ ERROR: Wrong key or corrupt data + select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); + NOTICE: dbg: parse_literal_data: data type=b + ERROR: Not text data ++-- NUL byte in text decrypt. Ciphertext source: ++-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ ++-- --personal-compress-preferences uncompressed --textmode \ ++-- --personal-cipher-preferences aes --no-emit-version --batch \ ++-- --symmetric --passphrase key --armor ++do $$ ++begin ++ perform pgp_sym_decrypt(dearmor(' ++-----BEGIN PGP MESSAGE----- ++ ++jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH ++vu0YlJP5D5BX7yqZ+Pry7TlDmiFO ++=rV7z ++-----END PGP MESSAGE----- ++'), 'key', 'debug=1'); ++exception when others then ++ raise '%', ++ regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); ++end ++$$; ++ERROR: invalid byte sequence for encoding [REDACTED]: 0x00 ++CONTEXT: PL/pgSQL function inline_code_block line 12 at RAISE + -- Decryption with a certain incorrect key yields an apparent BZip2-compressed + -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') + -- until the random prefix gave rise to that property. 
+diff --git a/contrib/pgcrypto/expected/pgp-pubkey-session.out b/contrib/pgcrypto/expected/pgp-pubkey-session.out +new file mode 100644 +index 00000000000..f724d98eb24 +--- /dev/null ++++ b/contrib/pgcrypto/expected/pgp-pubkey-session.out +@@ -0,0 +1,47 @@ ++-- Test for overflow with session key at decrypt. ++-- Data automatically generated by scripts/pgp_session_data.py. ++-- See this file for details explaining how this data is generated. ++SELECT pgp_pub_decrypt_bytea( ++'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 ++da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 ++94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd ++0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 ++3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 ++a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 ++b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d ++8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc ++0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 ++57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 ++ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 ++67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 ++060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 ++2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 ++5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d ++135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, ++'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad ++9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f ++f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 ++07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 
++23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 ++f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c ++138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 ++c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 ++18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 ++e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 ++de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 ++239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 ++ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 ++9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e ++74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c ++3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 ++58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 ++507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd ++183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 ++25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 ++3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 ++cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 ++ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 ++7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 ++487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 ++9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); ++ERROR: Public key too big +diff --git a/contrib/pgcrypto/pgp-pgsql.c b/contrib/pgcrypto/pgp-pgsql.c +index 62a2f351e43..ae634a05b72 100644 +--- a/contrib/pgcrypto/pgp-pgsql.c ++++ b/contrib/pgcrypto/pgp-pgsql.c +@@ -643,6 +643,7 @@ pgp_sym_decrypt_text(PG_FUNCTION_ARGS) + arg = PG_GETARG_BYTEA_PP(2); + + res = decrypt_internal(0, 1, data, key, NULL, arg); ++ 
pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); + + PG_FREE_IF_COPY(data, 0); + PG_FREE_IF_COPY(key, 1); +@@ -744,6 +745,7 @@ pgp_pub_decrypt_text(PG_FUNCTION_ARGS) + arg = PG_GETARG_BYTEA_PP(3); + + res = decrypt_internal(1, 1, data, key, psw, arg); ++ pg_verifymbstr(VARDATA_ANY(res), VARSIZE_ANY_EXHDR(res), false); + + PG_FREE_IF_COPY(data, 0); + PG_FREE_IF_COPY(key, 1); +diff --git a/contrib/pgcrypto/pgp-pubdec.c b/contrib/pgcrypto/pgp-pubdec.c +index a0a5738a40e..2a13aa3e6ad 100644 +--- a/contrib/pgcrypto/pgp-pubdec.c ++++ b/contrib/pgcrypto/pgp-pubdec.c +@@ -157,6 +157,7 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) + uint8 *msg; + int msglen; + PGP_MPI *m; ++ unsigned sess_key_len; + + pk = ctx->pub_key; + if (pk == NULL) +@@ -220,11 +221,19 @@ pgp_parse_pubenc_sesskey(PGP_Context *ctx, PullFilter *pkt) + if (res < 0) + goto out; + ++ sess_key_len = msglen - 3; ++ if (sess_key_len > PGP_MAX_KEY) ++ { ++ px_debug("incorrect session key length=%u", sess_key_len); ++ res = PXE_PGP_KEY_TOO_BIG; ++ goto out; ++ } ++ + /* + * got sesskey + */ + ctx->cipher_algo = *msg; +- ctx->sess_key_len = msglen - 3; ++ ctx->sess_key_len = sess_key_len; + memcpy(ctx->sess_key, msg + 1, ctx->sess_key_len); + + out: +diff --git a/contrib/pgcrypto/px.c b/contrib/pgcrypto/px.c +index e4fbfd75106..fbee6ad0d68 100644 +--- a/contrib/pgcrypto/px.c ++++ b/contrib/pgcrypto/px.c +@@ -68,6 +68,7 @@ static const struct error_desc px_err_list[] = { + {PXE_PGP_UNEXPECTED_PKT, "Unexpected packet in key data"}, + {PXE_PGP_MATH_FAILED, "Math operation failed"}, + {PXE_PGP_SHORT_ELGAMAL_KEY, "Elgamal keys must be at least 1024 bits long"}, ++ {PXE_PGP_KEY_TOO_BIG, "Public key too big"}, + {PXE_PGP_UNKNOWN_PUBALGO, "Unknown public-key encryption algorithm"}, + {PXE_PGP_WRONG_KEY, "Wrong key"}, + {PXE_PGP_MULTIPLE_KEYS, +diff --git a/contrib/pgcrypto/px.h b/contrib/pgcrypto/px.h +index 0d4722a04a0..2a1725d9969 100644 +--- a/contrib/pgcrypto/px.h ++++ 
b/contrib/pgcrypto/px.h +@@ -86,7 +86,7 @@ void px_free(void *p); + /* -108 is unused */ + #define PXE_PGP_MATH_FAILED -109 + #define PXE_PGP_SHORT_ELGAMAL_KEY -110 +-/* -111 is unused */ ++#define PXE_PGP_KEY_TOO_BIG -111 + #define PXE_PGP_UNKNOWN_PUBALGO -112 + #define PXE_PGP_WRONG_KEY -113 + #define PXE_PGP_MULTIPLE_KEYS -114 +diff --git a/contrib/pgcrypto/scripts/pgp_session_data.py b/contrib/pgcrypto/scripts/pgp_session_data.py +new file mode 100644 +index 00000000000..999350bb2bc +--- /dev/null ++++ b/contrib/pgcrypto/scripts/pgp_session_data.py +@@ -0,0 +1,491 @@ ++#!/usr/bin/python ++# -*- coding: utf-8 -*- ++# ++# Generate PGP data to check the session key length of the input data provided ++# to pgp_pub_decrypt_bytea(). ++# ++# First, the crafted data is generated from valid RSA data, freshly generated ++# by this script each time it is run, see generate_rsa_keypair(). ++# Second, the crafted PGP data is built, see build_message_data() and ++# build_key_data(). Finally, the resulting SQL script is generated. ++# ++# This script generates in stdout the SQL file that is used in the regression ++# tests of pgcrypto. The following command can be used to regenerate the file ++# which should never be manually manipulated: ++# python3 scripts/pgp_session_data.py > sql/pgp-pubkey-session.sql ++ ++import os ++import re ++import struct ++import secrets ++import sys ++import time ++ ++# pwn for binary manipulation (p32, p64) ++from pwn import * ++ ++# Cryptographic libraries, to craft the PGP data. ++from Crypto.Cipher import AES ++from Crypto.PublicKey import RSA ++from Crypto.Util.number import inverse ++ ++# AES key used for session key encryption (16 bytes for AES-128) ++AES_KEY = b'\x01' * 16 ++ ++def generate_rsa_keypair(key_size: int = 2048) -> dict: ++ """ ++ Generate a fresh RSA key pair. 
++ ++ The generated key includes all components needed for PGP operations: ++ - n: public modulus (p * q) ++ - e: public exponent (typically 65537) ++ - d: private exponent (e^-1 mod phi(n)) ++ - p, q: prime factors of n ++ - u: coefficient (p^-1 mod q) for CRT optimization ++ ++ The caller can pass the wanted key size in input, for a default of 2048 ++ bits. This function returns the RSA key components, after performing ++ some validation on them. ++ """ ++ ++ start_time = time.time() ++ ++ # Generate RSA key ++ key = RSA.generate(key_size) ++ ++ # Extract all key components ++ rsa_components = { ++ 'n': key.n, # Public modulus (p * q) ++ 'e': key.e, # Public exponent (typically 65537) ++ 'd': key.d, # Private exponent (e^-1 mod phi(n)) ++ 'p': key.p, # First prime factor ++ 'q': key.q, # Second prime factor ++ 'u': inverse(key.p, key.q) # Coefficient for CRT: p^-1 mod q ++ } ++ ++ # Validate key components for correctness ++ validate_rsa_key(rsa_components) ++ ++ return rsa_components ++ ++def validate_rsa_key(rsa: dict) -> None: ++ """ ++ Validate a generated RSA key. ++ ++ This function performs basic validation to ensure the RSA key is properly ++ constructed and all components are consistent, at least mathematically. ++ ++ Validations performed: ++ 1. n = p * q (modulus is product of primes) ++ 2. gcd(e, phi(n)) = 1 (public exponent is coprime to phi(n)) ++ 3. (d * e) mod(phi(n)) = 1 (private exponent is multiplicative inverse) ++ 4. 
(u * p) (mod q) = 1 (coefficient is correct for CRT) ++ """ ++ ++ n, e, d, p, q, u = rsa['n'], rsa['e'], rsa['d'], rsa['p'], rsa['q'], rsa['u'] ++ ++ # Check that n = p * q ++ if n != p * q: ++ raise ValueError("RSA validation failed: n <> p * q") ++ ++ # Check that p and q are different ++ if p == q: ++ raise ValueError("RSA validation failed: p = q (not allowed)") ++ ++ # Calculate phi(n) = (p-1)(q-1) ++ phi_n = (p - 1) * (q - 1) ++ ++ # Check that gcd(e, phi(n)) = 1 ++ def gcd(a, b): ++ while b: ++ a, b = b, a % b ++ return a ++ ++ if gcd(e, phi_n) != 1: ++ raise ValueError("RSA validation failed: gcd(e, phi(n)) <> 1") ++ ++ # Check that (d * e) mod(phi(n)) = 1 ++ if (d * e) % phi_n != 1: ++ raise ValueError("RSA validation failed: d * e <> 1 (mod phi(n))") ++ ++ # Check that (u * p) (mod q) = 1 ++ if (u * p) % q != 1: ++ raise ValueError("RSA validation failed: u * p <> 1 (mod q)") ++ ++def mpi_encode(x: int) -> bytes: ++ """ ++ Encode an integer as an OpenPGP Multi-Precision Integer (MPI). ++ ++ Format (RFC 4880, Section 3.2): ++ - 2 bytes: bit length of the integer (big-endian) ++ - N bytes: the integer in big-endian format ++ ++ This is used to encode RSA key components (n, e, d, p, q, u) in PGP ++ packets. ++ ++ The integer to encode is given in input, returning an MPI-encoded ++ integer. ++ ++ For example: ++ mpi_encode(65537) -> b'\x00\x11\x01\x00\x01' ++ (17 bits, value 0x010001) ++ """ ++ if x < 0: ++ raise ValueError("MPI cannot encode negative integers") ++ ++ if x == 0: ++ # Special case: zero has 0 bits and empty magnitude ++ bits = 0 ++ mag = b"" ++ else: ++ # Calculate bit length and convert to bytes ++ bits = x.bit_length() ++ mag = x.to_bytes((bits + 7) // 8, 'big') ++ ++ # Pack: 2-byte bit length + magnitude bytes ++ return struct.pack('>H', bits) + mag ++ ++def new_packet(tag: int, payload: bytes) -> bytes: ++ """ ++ Create a new OpenPGP packet with a proper header. 
++ ++ OpenPGP packet format (RFC 4880, Section 4.2): ++ - New packet format: 0xC0 | tag ++ - Length encoding depends on payload size: ++ * 0-191: single byte ++ * 192-8383: two bytes (192 + ((length - 192) >> 8), (length - 192) & 0xFF) ++ * 8384+: five bytes (0xFF + 4-byte big-endian length) ++ ++ The packet is built from a "tag" (1-63) and some "payload" data. The ++ result generated is a complete OpenPGP packet. ++ ++ For example: ++ new_packet(1, b'data') -> b'\xC1\x04data' ++ (Tag 1, length 4, payload 'data') ++ """ ++ # New packet format: set bits 7 and 6, tag in bits 0-5 ++ first = 0xC0 | (tag & 0x3F) ++ ln = len(payload) ++ ++ # Encode length according to OpenPGP specification ++ if ln <= 191: ++ # Single byte length for small packets ++ llen = bytes([ln]) ++ elif ln <= 8383: ++ # Two-byte length for medium packets ++ ln2 = ln - 192 ++ llen = bytes([192 + (ln2 >> 8), ln2 & 0xFF]) ++ else: ++ # Five-byte length for large packets ++ llen = bytes([255]) + struct.pack('>I', ln) ++ ++ return bytes([first]) + llen + payload ++ ++def build_key_data(rsa: dict) -> bytes: ++ """ ++ Build the key data, containing an RSA private key. ++ ++ The RSA contents should have been generated previously. ++ ++ Format (see RFC 4880, Section 5.5.3): ++ - 1 byte: version (4) ++ - 4 bytes: creation time (current Unix timestamp) ++ - 1 byte: public key algorithm (2 = RSA encrypt) ++ - MPI: RSA public modulus n ++ - MPI: RSA public exponent e ++ - 1 byte: string-to-key usage (0 = no encryption) ++ - MPI: RSA private exponent d ++ - MPI: RSA prime p ++ - MPI: RSA prime q ++ - MPI: RSA coefficient u = p^-1 mod q ++ - 2 bytes: checksum of private key material ++ ++ This function takes a set of RSA key components in input (n, e, d, p, q, u) ++ and returns a secret key packet. 
++ """ ++ ++ # Public key portion ++ ver = bytes([4]) # Version 4 key ++ ctime = struct.pack('>I', int(time.time())) # Current Unix timestamp ++ algo = bytes([2]) # RSA encrypt algorithm ++ n_mpi = mpi_encode(rsa['n']) # Public modulus ++ e_mpi = mpi_encode(rsa['e']) # Public exponent ++ pub = ver + ctime + algo + n_mpi + e_mpi ++ ++ # Private key portion ++ hide_type = bytes([0]) # No string-to-key encryption ++ d_mpi = mpi_encode(rsa['d']) # Private exponent ++ p_mpi = mpi_encode(rsa['p']) # Prime p ++ q_mpi = mpi_encode(rsa['q']) # Prime q ++ u_mpi = mpi_encode(rsa['u']) # Coefficient u = p^-1 mod q ++ ++ # Calculate checksum of private key material (simple sum mod 65536) ++ private_data = d_mpi + p_mpi + q_mpi + u_mpi ++ cksum = sum(private_data) & 0xFFFF ++ ++ secret = hide_type + private_data + struct.pack('>H', cksum) ++ payload = pub + secret ++ ++ return new_packet(7, payload) ++ ++def pgp_cfb_encrypt_resync(key, plaintext): ++ """ ++ Implement OpenPGP CFB mode with resync. ++ ++ OpenPGP CFB mode is a variant of standard CFB with a resync operation ++ after the first two blocks. ++ ++ Algorithm (RFC 4880, Section 13.9): ++ 1. Block 1: FR=zeros, encrypt full block_size bytes ++ 2. Block 2: FR=block1, encrypt only 2 bytes ++ 3. Resync: FR = block1[2:] + block2 ++ 4. 
Remaining blocks: standard CFB mode ++ ++ This function uses the following arguments: ++ - key: AES encryption key (16 bytes for AES-128) ++ - plaintext: Data to encrypt ++ """ ++ block_size = 16 # AES block size ++ cipher = AES.new(key[:16], AES.MODE_ECB) # Use ECB for manual CFB ++ ciphertext = b'' ++ ++ # Block 1: FR=zeros, encrypt full 16 bytes ++ FR = b'\x00' * block_size ++ FRE = cipher.encrypt(FR) # Encrypt the feedback register ++ block1 = bytes(a ^ b for a, b in zip(FRE, plaintext[0:16])) ++ ciphertext += block1 ++ ++ # Block 2: FR=block1, encrypt only 2 bytes ++ FR = block1 ++ FRE = cipher.encrypt(FR) ++ block2 = bytes(a ^ b for a, b in zip(FRE[0:2], plaintext[16:18])) ++ ciphertext += block2 ++ ++ # Resync: FR = block1[2:16] + block2[0:2] ++ # This is the key difference from standard CFB mode ++ FR = block1[2:] + block2 ++ ++ # Block 3+: Continue with standard CFB mode ++ pos = 18 ++ while pos < len(plaintext): ++ FRE = cipher.encrypt(FR) ++ chunk_len = min(block_size, len(plaintext) - pos) ++ chunk = plaintext[pos:pos+chunk_len] ++ enc_chunk = bytes(a ^ b for a, b in zip(FRE[:chunk_len], chunk)) ++ ciphertext += enc_chunk ++ ++ # Update feedback register for next iteration ++ if chunk_len == block_size: ++ FR = enc_chunk ++ else: ++ # Partial block: pad with old FR bytes ++ FR = enc_chunk + FR[chunk_len:] ++ pos += chunk_len ++ ++ return ciphertext ++ ++def build_literal_data_packet(data: bytes) -> bytes: ++ """ ++ Build a literal data packet containing a message. ++ ++ Format (RFC 4880, Section 5.9): ++ - 1 byte: data format ('b' = binary, 't' = text, 'u' = UTF-8 text) ++ - 1 byte: filename length (0 = no filename) ++ - N bytes: filename (empty in this case) ++ - 4 bytes: date (current Unix timestamp) ++ - M bytes: literal data ++ ++ The data used to build the packet is given in input, with the generated ++ result returned. 
++ """ ++ body = bytes([ ++ ord('b'), # Binary data format ++ 0, # Filename length (0 = no filename) ++ ]) + struct.pack('>I', int(time.time())) + data # Current timestamp + data ++ ++ return new_packet(11, body) ++ ++def build_symenc_data_packet(sess_key: bytes, cipher_algo: int, payload: bytes) -> bytes: ++ """ ++ Build a symmetrically-encrypted data packet using AES-128-CFB. ++ ++ This packet contains encrypted data using the session key. The format ++ includes a random prefix, for security (see RFC 4880, Section 5.7). ++ ++ Packet structure: ++ - Random prefix (block_size bytes) ++ - Prefix repeat (last 2 bytes of prefix repeated) ++ - Encrypted literal data packet ++ ++ This function uses the following set of arguments: ++ - sess_key: Session key for encryption ++ - cipher_algo: Cipher algorithm identifier (7 = AES-128) ++ - payload: Data to encrypt (wrapped in literal data packet) ++ """ ++ block_size = 16 # AES-128 block size ++ key = sess_key[:16] # Use first 16 bytes for AES-128 ++ ++ # Create random prefix + repeat last 2 bytes (total 18 bytes) ++ # This is required by OpenPGP for integrity checking ++ prefix_random = secrets.token_bytes(block_size) ++ prefix = prefix_random + prefix_random[-2:] # 18 bytes total ++ ++ # Wrap payload in literal data packet ++ literal_pkt = build_literal_data_packet(payload) ++ ++ # Plaintext = prefix + literal data packet ++ plaintext = prefix + literal_pkt ++ ++ # Encrypt using OpenPGP CFB mode with resync ++ ciphertext = pgp_cfb_encrypt_resync(key, plaintext) ++ ++ return new_packet(9, ciphertext) ++ ++def build_tag1_packet(rsa: dict, sess_key: bytes) -> bytes: ++ """ ++ Build a public-key encrypted key. ++ ++ This is a very important function, as it is able to create the packet ++ triggering the overflow check. This function can also be used to create ++ "legit" packet data. 
++ ++ Format (RFC 4880, Section 5.1): ++ - 1 byte: version (3) ++ - 8 bytes: key ID (0 = any key accepted) ++ - 1 byte: public key algorithm (2 = RSA encrypt) ++ - MPI: RSA-encrypted session key ++ ++ This uses in arguments the generated RSA key pair, and the session key ++ to encrypt. The latter is manipulated to trigger the overflow. ++ ++ This function returns a complete packet encrypted by a session key. ++ """ ++ ++ # Calculate RSA modulus size in bytes ++ n_bytes = (rsa['n'].bit_length() + 7) // 8 ++ ++ # Session key message format: ++ # - 1 byte: symmetric cipher algorithm (7 = AES-128) ++ # - N bytes: session key ++ # - 2 bytes: checksum (simple sum of session key bytes) ++ algo_byte = bytes([7]) # AES-128 algorithm identifier ++ cksum = sum(sess_key) & 0xFFFF # 16-bit checksum ++ M = algo_byte + sess_key + struct.pack('>H', cksum) ++ ++ # PKCS#1 v1.5 padding construction ++ # Format: 0x02 || PS || 0x00 || M ++ # Total padded message must be exactly n_bytes long. ++ total_len = n_bytes # Total length must equal modulus size in bytes ++ ps_len = total_len - len(M) - 2 # Subtract 2 for 0x02 and 0x00 bytes ++ ++ if ps_len < 8: ++ raise ValueError(f"Padding string too short ({ps_len} bytes); need at least 8 bytes. " ++ f"Message length: {len(M)}, Modulus size: {n_bytes} bytes") ++ ++ # Create padding string with *ALL* bytes being 0xFF (no zero separator!) 
++ PS = bytes([0xFF]) * ps_len ++ ++ # Construct the complete padded message ++ # Normal PKCS#1 v1.5 padding: 0x02 || PS || 0x00 || M ++ padded = bytes([0x02]) + PS + bytes([0x00]) + M ++ ++ # Verify padding construction ++ if len(padded) != n_bytes: ++ raise ValueError(f"Padded message length ({len(padded)}) doesn't match RSA modulus size ({n_bytes})") ++ ++ # Convert padded message to integer and encrypt with RSA ++ m_int = int.from_bytes(padded, 'big') ++ ++ # Ensure message is smaller than modulus (required for RSA) ++ if m_int >= rsa['n']: ++ raise ValueError("Padded message is larger than RSA modulus") ++ ++ # RSA encryption: c = m^e mod n ++ c_int = pow(m_int, rsa['e'], rsa['n']) ++ ++ # Encode encrypted result as MPI ++ c_mpi = mpi_encode(c_int) ++ ++ # Build complete packet ++ ver = bytes([3]) # Version 3 packet ++ key_id = b"\x00" * 8 # Key ID (0 = any key accepted) ++ algo = bytes([2]) # RSA encrypt algorithm ++ payload = ver + key_id + algo + c_mpi ++ ++ return new_packet(1, payload) ++ ++def build_message_data(rsa: dict) -> bytes: ++ """ ++ This function creates a crafted message, with a long session key ++ length. ++ ++ This takes in input the RSA key components generated previously, ++ returning a concatenated set of PGP packets crafted for the purpose ++ of this test. ++ """ ++ ++ # Base prefix for session key (AES key + padding + size). ++ # Note that the crafted size is the important part for this test. ++ prefix = AES_KEY + b"\x00" * 16 + p32(0x10) ++ ++ # Build encrypted data packet, legit. ++ sedata = build_symenc_data_packet(AES_KEY, cipher_algo=7, payload=b"\x0a\x00") ++ ++ # Build multiple packets ++ packets = [ ++ # First packet, legit. ++ build_tag1_packet(rsa, prefix), ++ ++ # Encrypted data packet, legit. ++ sedata, ++ ++ # Second packet: information payload. ++ # ++ # This packet contains a longer-crafted session key, able to trigger ++ # the overflow check in pgcrypto. 
This is the critical part, and ++ # you are right to pay a lot of attention here if you are ++ # reading this code. ++ build_tag1_packet(rsa, prefix) ++ ] ++ ++ return b"".join(packets) ++ ++def main(): ++ # Default key size. ++ # This number can be set to a higher number if wanted, like 4096. We ++ # just do not need to do that here. ++ key_size = 2048 ++ ++ # Generate fresh RSA key pair ++ rsa = generate_rsa_keypair(key_size) ++ ++ # Generate the message data. ++ print("### Building message data", file=sys.stderr) ++ message_data = build_message_data(rsa) ++ ++ # Build the key containing the RSA private key ++ print("### Building key data", file=sys.stderr) ++ key_data = build_key_data(rsa) ++ ++ # Convert to hexadecimal, for the bytea used in the SQL file. ++ message_data = message_data.hex() ++ key_data = key_data.hex() ++ ++ # Split each value into lines of 72 characters, for readability. ++ message_data = re.sub("(.{72})", "\\1\n", message_data, 0, re.DOTALL) ++ key_data = re.sub("(.{72})", "\\1\n", key_data, 0, re.DOTALL) ++ ++ # Get the script filename for documentation ++ file_basename = os.path.basename(__file__) ++ ++ # Output the SQL test case ++ print(f'''-- Test for overflow with session key at decrypt. ++-- Data automatically generated by scripts/{file_basename}. ++-- See this file for details explaining how this data is generated. 
++SELECT pgp_pub_decrypt_bytea( ++'\\x{message_data}'::bytea, ++'\\x{key_data}'::bytea);''', ++ file=sys.stdout) ++ ++if __name__ == "__main__": ++ main() +diff --git a/contrib/pgcrypto/sql/pgp-decrypt.sql b/contrib/pgcrypto/sql/pgp-decrypt.sql +index 557948d7c75..4901ab66539 100644 +--- a/contrib/pgcrypto/sql/pgp-decrypt.sql ++++ b/contrib/pgcrypto/sql/pgp-decrypt.sql +@@ -230,7 +230,7 @@ SaV9L04ky1qECNDx3XjnoKLC+H7IOQ== + '), '0123456789abcdefghij'), 'sha1'), 'hex'); + -- expected: da39a3ee5e6b4b0d3255bfef95601890afd80709 + +-select encode(digest(pgp_sym_decrypt(dearmor(' ++select encode(digest(pgp_sym_decrypt_bytea(dearmor(' + -----BEGIN PGP MESSAGE----- + Comment: dat3.aes.sha1.mdc.s2k3.z0 + +@@ -288,6 +288,27 @@ VsxxqLSPzNLAeIspJk5G + -- Routine text/binary mismatch. + select pgp_sym_decrypt(pgp_sym_encrypt_bytea('P', 'key'), 'key', 'debug=1'); + ++-- NUL byte in text decrypt. Ciphertext source: ++-- printf 'a\x00\xc' | gpg --homedir /nonexistent \ ++-- --personal-compress-preferences uncompressed --textmode \ ++-- --personal-cipher-preferences aes --no-emit-version --batch \ ++-- --symmetric --passphrase key --armor ++do $$ ++begin ++ perform pgp_sym_decrypt(dearmor(' ++-----BEGIN PGP MESSAGE----- ++ ++jA0EBwMCXLc8pozB10Fg0jQBVUID59TLvWutJp0j6eh9ZgjqIRzdYaIymFB8y4XH ++vu0YlJP5D5BX7yqZ+Pry7TlDmiFO ++=rV7z ++-----END PGP MESSAGE----- ++'), 'key', 'debug=1'); ++exception when others then ++ raise '%', ++ regexp_replace(sqlerrm, 'encoding "[^"]*"', 'encoding [REDACTED]'); ++end ++$$; ++ + -- Decryption with a certain incorrect key yields an apparent BZip2-compressed + -- plaintext. Ciphertext source: iterative pgp_sym_encrypt('secret', 'key') + -- until the random prefix gave rise to that property. 
+diff --git a/contrib/pgcrypto/sql/pgp-pubkey-session.sql b/contrib/pgcrypto/sql/pgp-pubkey-session.sql +new file mode 100644 +index 00000000000..51792f1f4d8 +--- /dev/null ++++ b/contrib/pgcrypto/sql/pgp-pubkey-session.sql +@@ -0,0 +1,46 @@ ++-- Test for overflow with session key at decrypt. ++-- Data automatically generated by scripts/pgp_session_data.py. ++-- See this file for details explaining how this data is generated. ++SELECT pgp_pub_decrypt_bytea( ++'\xc1c04c030000000000000000020800a46f5b9b1905b49457a6485474f71ed9b46c2527e1 ++da08e1f7871e12c3d38828f2076b984a595bf60f616599ca5729d547de06a258bfbbcd30 ++94a321e4668cd43010f0ca8ecf931e5d39bda1152c50c367b11c723f270729245d3ebdbd ++0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5060af7603cfd9ed186ebadd616 ++3b50ae42bea5f6d14dda24e6d4687b434c175084515d562e896742b0ba9a1c87d5642e10 ++a5550379c71cc490a052ada483b5d96526c0a600fc51755052aa77fdf72f7b4989b920e7 ++b90f4b30787a46482670d5caecc7a515a926055ad5509d135702ce51a0e4c1033f2d939d ++8f0075ec3428e17310da37d3d2d7ad1ce99adcc91cd446c366c402ae1ee38250343a7fcc ++0f8bc28020e603d7a4795ef0dcc1c04c030000000000000000020800a46f5b9b1905b494 ++57a6485474f71ed9b46c2527e1da08e1f7871e12c3d38828f2076b984a595bf60f616599 ++ca5729d547de06a258bfbbcd3094a321e4668cd43010f0ca8ecf931e5d39bda1152c50c3 ++67b11c723f270729245d3ebdbd0694d320c5a5aa6a405fb45182acb3d7973cbce398e0c5 ++060af7603cfd9ed186ebadd6163b50ae42bea5f6d14dda24e6d4687b434c175084515d56 ++2e896742b0ba9a1c87d5642e10a5550379c71cc490a052ada483b5d96526c0a600fc5175 ++5052aa77fdf72f7b4989b920e7b90f4b30787a46482670d5caecc7a515a926055ad5509d ++135702ce51a0e4c1033f2d939d8f0075ec3428e17310da37d3d2d7ad1ce99adc'::bytea, ++'\xc7c2d8046965d657020800eef8bf1515adb1a3ee7825f75c668ea8dd3e3f9d13e958f6ad ++9c55adc0c931a4bb00abe1d52cf7bb0c95d537949d277a5292ede375c6b2a67a3bf7d19f ++f975bb7e7be35c2d8300dacba360a0163567372f7dc24000cc7cb6170bedc8f3b1f98c12 ++07a6cb4de870a4bc61319b139dcc0e20c368fd68f8fd346d2c0b69c5aed560504e2ec6f1 
++23086fe3c5540dc4dd155c0c67257c4ada862f90fe172ace344089da8135e92aca5c2709 ++f1c1bc521798bb8c0365841496e709bd184132d387e0c9d5f26dc00fd06c3a76ef66a75c ++138285038684707a847b7bd33cfbefbf1d336be954a8048946af97a66352adef8e8b5ae4 ++c4748c6f2510265b7a8267bc370dbb00110100010007ff7e72d4f95d2d39901ac12ca5c5 ++18e767e719e72340c3fab51c8c5ab1c40f31db8eaffe43533fa61e2dbca2c3f4396c0847 ++e5434756acbb1f68128f4136bb135710c89137d74538908dac77967de9e821c559700dd9 ++de5a2727eec1f5d12d5d74869dd1de45ed369d94a8814d23861dd163f8c27744b26b98f0 ++239c2e6dd1e3493b8cc976fdc8f9a5e250f715aa4c3d7d5f237f8ee15d242e8fa941d1a0 ++ed9550ab632d992a97518d142802cb0a97b251319bf5742db8d9d8cbaa06cdfba2d75bc9 ++9d77a51ff20bd5ba7f15d7af6e85b904de2855d19af08d45f39deb85403033c69c767a8e ++74a343b1d6c8911d34ea441ac3850e57808ed3d885835cbe6c79d10400ef16256f3d5c4c ++3341516a2d2aa888df81b603f48a27f3666b40f992a857c1d11ff639cd764a9b42d5a1f8 ++58b4aeee36b85508bb5e8b91ef88a7737770b330224479d9b44eae8c631bc43628b69549 ++507c0a1af0be0dd7696015abea722b571eb35eefc4ab95595378ec12814727443f625fcd ++183bb9b3bccf53b54dd0e5e7a50400ffe08537b2d4e6074e4a1727b658cfccdec8962302 ++25e300c05690de45f7065c3d40d86f544a64d51a3e94424f9851a16d1322ebdb41fa8a45 ++3131f3e2dc94e858e6396722643df382680f815e53bcdcde5da622f50530a83b217f1103 ++cdd6e5e9babe1e415bbff28d44bd18c95f43bbd04afeb2a2a99af38a571c7540de21df03 ++ff62c0a33d9143dd3f639893f47732c11c5a12c6052d1935f4d507b7ae1f76ab0e9a69b8 ++7305a7f7c19bd509daf4903bff614bc26d118f03e461469c72c12d3a2bb4f78e4d342ce8 ++487723649a01ed2b9eb11c662134502c098d55dfcd361939d8370873422c3da75a515a75 ++9ffedfe7df44fb3c20f81650801a30d43b5c90b98b3eee'::bytea); +diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c +index 0047efc075f..33c3a94874d 100644 +--- a/contrib/unaccent/unaccent.c ++++ b/contrib/unaccent/unaccent.c +@@ -149,9 +149,9 @@ initTrie(const char *filename) + state = 0; + for (ptr = line; *ptr; ptr += ptrlen) + { +- ptrlen = pg_mblen(ptr); ++ ptrlen = pg_mblen_cstr(ptr); + /* ignore 
whitespace, but end src or trg */ +- if (t_isspace(ptr)) ++ if (t_isspace_cstr(ptr)) + { + if (state == 1) + state = 2; +@@ -315,6 +315,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) + char *srcchar = (char *) PG_GETARG_POINTER(1); + int32 len = PG_GETARG_INT32(2); + char *srcstart = srcchar; ++ const char *srcend = srcstart + len; + TSLexeme *res; + StringInfoData buf; + +@@ -342,7 +343,7 @@ unaccent_lexize(PG_FUNCTION_ARGS) + } + else + { +- matchlen = pg_mblen(srcchar); ++ matchlen = pg_mblen_range(srcchar, srcend); + if (buf.data != NULL) + appendBinaryStringInfo(&buf, srcchar, matchlen); + } +diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c +index a676966b877..6aa1b34d66e 100644 +--- a/src/backend/access/transam/multixact.c ++++ b/src/backend/access/transam/multixact.c +@@ -3306,7 +3306,6 @@ multixact_redo(XLogReaderState *record) + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate xlrec; +- int pageno; + + memcpy(&xlrec, XLogRecGetData(record), + SizeOfMultiXactTruncate); +@@ -3331,14 +3330,6 @@ multixact_redo(XLogReaderState *record) + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); + + PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); +- +- /* +- * During XLOG replay, latest_page_number isn't necessarily set up +- * yet; insert a suitable value to bypass the sanity test in +- * SimpleLruTruncate. 
+- */ +- pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); +- MultiXactOffsetCtl->shared->latest_page_number = pageno; + PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); + + LWLockRelease(MultiXactTruncationLock); +diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c +index 38615647253..fcd3860f7db 100644 +--- a/src/backend/catalog/pg_depend.c ++++ b/src/backend/catalog/pg_depend.c +@@ -22,11 +22,13 @@ + #include "catalog/pg_constraint.h" + #include "catalog/pg_depend.h" + #include "catalog/pg_extension.h" ++#include "catalog/pg_type.h" + #include "commands/extension.h" + #include "miscadmin.h" + #include "utils/fmgroids.h" + #include "utils/lsyscache.h" + #include "utils/rel.h" ++#include "utils/syscache.h" + + + static bool isObjectPinned(const ObjectAddress *object, Relation rel); +@@ -804,6 +806,77 @@ getAutoExtensionsOfObject(Oid classId, Oid objectId) + return result; + } + ++/* ++ * Look up a type belonging to an extension. ++ * ++ * Returns the type's OID, or InvalidOid if not found. ++ * ++ * Notice that the type is specified by name only, without a schema. ++ * That's because this will typically be used by relocatable extensions ++ * which can't make a-priori assumptions about which schema their objects ++ * are in. As long as the extension only defines one type of this name, ++ * the answer is unique anyway. ++ * ++ * We might later add the ability to look up functions, operators, etc. 
++ */ ++Oid ++getExtensionType(Oid extensionOid, const char *typname) ++{ ++ Oid result = InvalidOid; ++ Relation depRel; ++ ScanKeyData key[3]; ++ SysScanDesc scan; ++ HeapTuple tup; ++ ++ depRel = table_open(DependRelationId, AccessShareLock); ++ ++ ScanKeyInit(&key[0], ++ Anum_pg_depend_refclassid, ++ BTEqualStrategyNumber, F_OIDEQ, ++ ObjectIdGetDatum(ExtensionRelationId)); ++ ScanKeyInit(&key[1], ++ Anum_pg_depend_refobjid, ++ BTEqualStrategyNumber, F_OIDEQ, ++ ObjectIdGetDatum(extensionOid)); ++ ScanKeyInit(&key[2], ++ Anum_pg_depend_refobjsubid, ++ BTEqualStrategyNumber, F_INT4EQ, ++ Int32GetDatum(0)); ++ ++ scan = systable_beginscan(depRel, DependReferenceIndexId, true, ++ NULL, 3, key); ++ ++ while (HeapTupleIsValid(tup = systable_getnext(scan))) ++ { ++ Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup); ++ ++ if (depform->classid == TypeRelationId && ++ depform->deptype == DEPENDENCY_EXTENSION) ++ { ++ Oid typoid = depform->objid; ++ HeapTuple typtup; ++ ++ typtup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); ++ if (!HeapTupleIsValid(typtup)) ++ continue; /* should we throw an error? 
*/ ++ if (strcmp(NameStr(((Form_pg_type) GETSTRUCT(typtup))->typname), ++ typname) == 0) ++ { ++ result = typoid; ++ ReleaseSysCache(typtup); ++ break; /* no need to keep searching */ ++ } ++ ReleaseSysCache(typtup); ++ } ++ } ++ ++ systable_endscan(scan); ++ ++ table_close(depRel, AccessShareLock); ++ ++ return result; ++} ++ + /* + * Detect whether a sequence is marked as "owned" by a column + * +diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c +index 0d754874c1e..1818906b54a 100644 +--- a/src/backend/catalog/pg_proc.c ++++ b/src/backend/catalog/pg_proc.c +@@ -1129,7 +1129,7 @@ match_prosrc_to_literal(const char *prosrc, const char *literal, + if (cursorpos > 0) + newcp++; + } +- chlen = pg_mblen(prosrc); ++ chlen = pg_mblen_cstr(prosrc); + if (strncmp(prosrc, literal, chlen) != 0) + goto fail; + prosrc += chlen; +diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c +index ccffec91132..a00cdcf6435 100644 +--- a/src/backend/commands/extension.c ++++ b/src/backend/commands/extension.c +@@ -45,6 +45,7 @@ + #include "catalog/pg_depend.h" + #include "catalog/pg_extension.h" + #include "catalog/pg_namespace.h" ++#include "catalog/pg_proc.h" + #include "catalog/pg_type.h" + #include "commands/alter.h" + #include "commands/comment.h" +@@ -60,10 +61,12 @@ + #include "utils/acl.h" + #include "utils/builtins.h" + #include "utils/fmgroids.h" ++#include "utils/inval.h" + #include "utils/lsyscache.h" + #include "utils/memutils.h" + #include "utils/rel.h" + #include "utils/snapmgr.h" ++#include "utils/syscache.h" + #include "utils/varlena.h" + + +@@ -104,7 +107,26 @@ typedef struct ExtensionVersionInfo + struct ExtensionVersionInfo *previous; /* current best predecessor */ + } ExtensionVersionInfo; + ++/* ++ * Cache structure for get_function_sibling_type (and maybe later, ++ * allied lookup functions). 
++ */ ++typedef struct ExtensionSiblingCache ++{ ++ struct ExtensionSiblingCache *next; /* list link */ ++ /* lookup key: requesting function's OID and type name */ ++ Oid reqfuncoid; ++ const char *typname; ++ bool valid; /* is entry currently valid? */ ++ uint32 exthash; /* cache hash of owning extension's OID */ ++ Oid typeoid; /* OID associated with typname */ ++} ExtensionSiblingCache; ++ ++/* Head of linked list of ExtensionSiblingCache structs */ ++static ExtensionSiblingCache *ext_sibling_list = NULL; ++ + /* Local functions */ ++static void ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue); + static List *find_update_path(List *evi_list, + ExtensionVersionInfo *evi_start, + ExtensionVersionInfo *evi_target, +@@ -254,6 +276,114 @@ get_extension_schema(Oid ext_oid) + return result; + } + ++/* ++ * get_function_sibling_type - find a type belonging to same extension as func ++ * ++ * Returns the type's OID, or InvalidOid if not found. ++ * ++ * This is useful in extensions, which won't have fixed object OIDs. ++ * We work from the calling function's own OID, which it can get from its ++ * FunctionCallInfo parameter, and look up the owning extension and thence ++ * a type belonging to the same extension. ++ * ++ * Notice that the type is specified by name only, without a schema. ++ * That's because this will typically be used by relocatable extensions ++ * which can't make a-priori assumptions about which schema their objects ++ * are in. As long as the extension only defines one type of this name, ++ * the answer is unique anyway. ++ * ++ * We might later add the ability to look up functions, operators, etc. ++ * ++ * This code is simply a frontend for some pg_depend lookups. Those lookups ++ * are fairly expensive, so we provide a simple cache facility. We assume ++ * that the passed typname is actually a C constant, or at least permanently ++ * allocated, so that we need not copy that string. 
++ */ ++Oid ++get_function_sibling_type(Oid funcoid, const char *typname) ++{ ++ ExtensionSiblingCache *cache_entry; ++ Oid extoid; ++ Oid typeoid; ++ ++ /* ++ * See if we have the answer cached. Someday there may be enough callers ++ * to justify a hash table, but for now, a simple linked list is fine. ++ */ ++ for (cache_entry = ext_sibling_list; cache_entry != NULL; ++ cache_entry = cache_entry->next) ++ { ++ if (funcoid == cache_entry->reqfuncoid && ++ strcmp(typname, cache_entry->typname) == 0) ++ break; ++ } ++ if (cache_entry && cache_entry->valid) ++ return cache_entry->typeoid; ++ ++ /* ++ * Nope, so do the expensive lookups. We do not expect failures, so we do ++ * not cache negative results. ++ */ ++ extoid = getExtensionOfObject(ProcedureRelationId, funcoid); ++ if (!OidIsValid(extoid)) ++ return InvalidOid; ++ typeoid = getExtensionType(extoid, typname); ++ if (!OidIsValid(typeoid)) ++ return InvalidOid; ++ ++ /* ++ * Build, or revalidate, cache entry. ++ */ ++ if (cache_entry == NULL) ++ { ++ /* Register invalidation hook if this is first entry */ ++ if (ext_sibling_list == NULL) ++ CacheRegisterSyscacheCallback(EXTENSIONOID, ++ ext_sibling_callback, ++ (Datum) 0); ++ ++ /* Momentarily zero the space to ensure valid flag is false */ ++ cache_entry = (ExtensionSiblingCache *) ++ MemoryContextAllocZero(CacheMemoryContext, ++ sizeof(ExtensionSiblingCache)); ++ cache_entry->next = ext_sibling_list; ++ ext_sibling_list = cache_entry; ++ } ++ ++ cache_entry->reqfuncoid = funcoid; ++ cache_entry->typname = typname; ++ cache_entry->exthash = GetSysCacheHashValue1(EXTENSIONOID, ++ ObjectIdGetDatum(extoid)); ++ cache_entry->typeoid = typeoid; ++ /* Mark it valid only once it's fully populated */ ++ cache_entry->valid = true; ++ ++ return typeoid; ++} ++ ++/* ++ * ext_sibling_callback ++ * Syscache inval callback function for EXTENSIONOID cache ++ * ++ * It seems sufficient to invalidate ExtensionSiblingCache entries when ++ * the owning extension's pg_extension 
entry is modified or deleted. ++ * Neither a requesting function's OID, nor the OID of the object it's ++ * looking for, could change without an extension update or drop/recreate. ++ */ ++static void ++ext_sibling_callback(Datum arg, int cacheid, uint32 hashvalue) ++{ ++ ExtensionSiblingCache *cache_entry; ++ ++ for (cache_entry = ext_sibling_list; cache_entry != NULL; ++ cache_entry = cache_entry->next) ++ { ++ if (hashvalue == 0 || ++ cache_entry->exthash == hashvalue) ++ cache_entry->valid = false; ++ } ++} ++ + /* + * Utility functions to check validity of extension and version names + */ +diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c +index 640b22ad936..10a30d235dc 100644 +--- a/src/backend/commands/operatorcmds.c ++++ b/src/backend/commands/operatorcmds.c +@@ -262,7 +262,6 @@ ValidateRestrictionEstimator(List *restrictionName) + { + Oid typeId[4]; + Oid restrictionOid; +- AclResult aclresult; + + typeId[0] = INTERNALOID; /* PlannerInfo */ + typeId[1] = OIDOID; /* operator OID */ +@@ -278,11 +277,32 @@ ValidateRestrictionEstimator(List *restrictionName) + errmsg("restriction estimator function %s must return type %s", + NameListToString(restrictionName), "float8"))); + +- /* Require EXECUTE rights for the estimator */ +- aclresult = pg_proc_aclcheck(restrictionOid, GetUserId(), ACL_EXECUTE); +- if (aclresult != ACLCHECK_OK) +- aclcheck_error(aclresult, OBJECT_FUNCTION, +- NameListToString(restrictionName)); ++ /* ++ * If the estimator is not a built-in function, require superuser ++ * privilege to install it. This protects against using something that is ++ * not a restriction estimator or has hard-wired assumptions about what ++ * data types it is working with. (Built-in estimators are required to ++ * defend themselves adequately against unexpected data type choices, but ++ * it seems impractical to expect that of extensions' estimators.) ++ * ++ * If it is built-in, only require EXECUTE rights. 
++ */ ++ if (restrictionOid >= FirstGenbkiObjectId) ++ { ++ if (!superuser()) ++ ereport(ERROR, ++ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), ++ errmsg("must be superuser to specify a non-built-in restriction estimator function"))); ++ } ++ else ++ { ++ AclResult aclresult; ++ ++ aclresult = pg_proc_aclcheck(restrictionOid, GetUserId(), ACL_EXECUTE); ++ if (aclresult != ACLCHECK_OK) ++ aclcheck_error(aclresult, OBJECT_FUNCTION, ++ NameListToString(restrictionName)); ++ } + + return restrictionOid; + } +@@ -298,7 +318,6 @@ ValidateJoinEstimator(List *joinName) + Oid typeId[5]; + Oid joinOid; + Oid joinOid2; +- AclResult aclresult; + + typeId[0] = INTERNALOID; /* PlannerInfo */ + typeId[1] = OIDOID; /* operator OID */ +@@ -336,11 +355,23 @@ ValidateJoinEstimator(List *joinName) + errmsg("join estimator function %s must return type %s", + NameListToString(joinName), "float8"))); + +- /* Require EXECUTE rights for the estimator */ +- aclresult = pg_proc_aclcheck(joinOid, GetUserId(), ACL_EXECUTE); +- if (aclresult != ACLCHECK_OK) +- aclcheck_error(aclresult, OBJECT_FUNCTION, +- NameListToString(joinName)); ++ /* privilege checks are the same as in ValidateRestrictionEstimator */ ++ if (joinOid >= FirstGenbkiObjectId) ++ { ++ if (!superuser()) ++ ereport(ERROR, ++ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), ++ errmsg("must be superuser to specify a non-built-in join estimator function"))); ++ } ++ else ++ { ++ AclResult aclresult; ++ ++ aclresult = pg_proc_aclcheck(joinOid, GetUserId(), ACL_EXECUTE); ++ if (aclresult != ACLCHECK_OK) ++ aclcheck_error(aclresult, OBJECT_FUNCTION, ++ NameListToString(joinName)); ++ } + + return joinOid; + } +diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c +index e732e66dace..d798610a8a9 100644 +--- a/src/backend/tsearch/dict_synonym.c ++++ b/src/backend/tsearch/dict_synonym.c +@@ -47,8 +47,8 @@ findwrd(char *in, char **end, uint16 *flags) + char *lastchar; + + /* Skip leading spaces */ +- while (*in && 
t_isspace(in)) +- in += pg_mblen(in); ++ while (*in && t_isspace_cstr(in)) ++ in += pg_mblen_cstr(in); + + /* Return NULL on empty lines */ + if (*in == '\0') +@@ -60,10 +60,10 @@ findwrd(char *in, char **end, uint16 *flags) + lastchar = start = in; + + /* Find end of word */ +- while (*in && !t_isspace(in)) ++ while (*in && !t_isspace_cstr(in)) + { + lastchar = in; +- in += pg_mblen(in); ++ in += pg_mblen_cstr(in); + } + + if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags) +diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c +index cb0835982d8..9a29b22d7e9 100644 +--- a/src/backend/tsearch/dict_thesaurus.c ++++ b/src/backend/tsearch/dict_thesaurus.c +@@ -190,8 +190,8 @@ thesaurusRead(const char *filename, DictThesaurus *d) + ptr = line; + + /* is it a comment? */ +- while (*ptr && t_isspace(ptr)) +- ptr += pg_mblen(ptr); ++ while (*ptr && t_isspace_cstr(ptr)) ++ ptr += pg_mblen_cstr(ptr); + + if (t_iseq(ptr, '#') || *ptr == '\0' || + t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) +@@ -212,7 +212,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) + errmsg("unexpected delimiter"))); + state = TR_WAITSUBS; + } +- else if (!t_isspace(ptr)) ++ else if (!t_isspace_cstr(ptr)) + { + beginwrd = ptr; + state = TR_INLEX; +@@ -225,7 +225,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) + newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); + state = TR_WAITSUBS; + } +- else if (t_isspace(ptr)) ++ else if (t_isspace_cstr(ptr)) + { + newLexeme(d, beginwrd, ptr, idsubst, posinsubst++); + state = TR_WAITLEX; +@@ -237,15 +237,15 @@ thesaurusRead(const char *filename, DictThesaurus *d) + { + useasis = true; + state = TR_INSUBS; +- beginwrd = ptr + pg_mblen(ptr); ++ beginwrd = ptr + pg_mblen_cstr(ptr); + } + else if (t_iseq(ptr, '\\')) + { + useasis = false; + state = TR_INSUBS; +- beginwrd = ptr + pg_mblen(ptr); ++ beginwrd = ptr + pg_mblen_cstr(ptr); + } +- else if (!t_isspace(ptr)) ++ else if (!t_isspace_cstr(ptr)) + { + 
useasis = false; + beginwrd = ptr; +@@ -254,7 +254,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) + } + else if (state == TR_INSUBS) + { +- if (t_isspace(ptr)) ++ if (t_isspace_cstr(ptr)) + { + if (ptr == beginwrd) + ereport(ERROR, +@@ -267,7 +267,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) + else + elog(ERROR, "unrecognized thesaurus state: %d", state); + +- ptr += pg_mblen(ptr); ++ ptr += pg_mblen_cstr(ptr); + } + + if (state == TR_INSUBS) +diff --git a/src/backend/tsearch/regis.c b/src/backend/tsearch/regis.c +index 2edd4faa8ec..a9b64fccd6e 100644 +--- a/src/backend/tsearch/regis.c ++++ b/src/backend/tsearch/regis.c +@@ -37,7 +37,7 @@ RS_isRegis(const char *str) + { + if (state == RS_IN_WAIT) + { +- if (t_isalpha(c)) ++ if (t_isalpha_cstr(c)) + /* okay */ ; + else if (t_iseq(c, '[')) + state = RS_IN_ONEOF; +@@ -48,14 +48,14 @@ RS_isRegis(const char *str) + { + if (t_iseq(c, '^')) + state = RS_IN_NONEOF; +- else if (t_isalpha(c)) ++ else if (t_isalpha_cstr(c)) + state = RS_IN_ONEOF_IN; + else + return false; + } + else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) + { +- if (t_isalpha(c)) ++ if (t_isalpha_cstr(c)) + /* okay */ ; + else if (t_iseq(c, ']')) + state = RS_IN_WAIT; +@@ -64,7 +64,7 @@ RS_isRegis(const char *str) + } + else + elog(ERROR, "internal error in RS_isRegis: state %d", state); +- c += pg_mblen(c); ++ c += pg_mblen_cstr(c); + } + + return (state == RS_IN_WAIT); +@@ -96,15 +96,14 @@ RS_compile(Regis *r, bool issuffix, const char *str) + { + if (state == RS_IN_WAIT) + { +- if (t_isalpha(c)) ++ if (t_isalpha_cstr(c)) + { + if (ptr) + ptr = newRegisNode(ptr, len); + else + ptr = r->node = newRegisNode(NULL, len); +- COPYCHAR(ptr->data, c); + ptr->type = RSF_ONEOF; +- ptr->len = pg_mblen(c); ++ ptr->len = ts_copychar_cstr(ptr->data, c); + } + else if (t_iseq(c, '[')) + { +@@ -125,10 +124,9 @@ RS_compile(Regis *r, bool issuffix, const char *str) + ptr->type = RSF_NONEOF; + state = RS_IN_NONEOF; + } +- else if 
(t_isalpha(c)) ++ else if (t_isalpha_cstr(c)) + { +- COPYCHAR(ptr->data, c); +- ptr->len = pg_mblen(c); ++ ptr->len = ts_copychar_cstr(ptr->data, c); + state = RS_IN_ONEOF_IN; + } + else /* shouldn't get here */ +@@ -136,11 +134,8 @@ RS_compile(Regis *r, bool issuffix, const char *str) + } + else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) + { +- if (t_isalpha(c)) +- { +- COPYCHAR(ptr->data + ptr->len, c); +- ptr->len += pg_mblen(c); +- } ++ if (t_isalpha_cstr(c)) ++ ptr->len += ts_copychar_cstr(ptr->data + ptr->len, c); + else if (t_iseq(c, ']')) + state = RS_IN_WAIT; + else /* shouldn't get here */ +@@ -148,7 +143,7 @@ RS_compile(Regis *r, bool issuffix, const char *str) + } + else + elog(ERROR, "internal error in RS_compile: state %d", state); +- c += pg_mblen(c); ++ c += pg_mblen_cstr(c); + } + + if (state != RS_IN_WAIT) /* shouldn't get here */ +@@ -187,10 +182,10 @@ mb_strchr(char *str, char *c) + char *ptr = str; + bool res = false; + +- clen = pg_mblen(c); ++ clen = pg_mblen_cstr(c); + while (*ptr && !res) + { +- plen = pg_mblen(ptr); ++ plen = pg_mblen_cstr(ptr); + if (plen == clen) + { + i = plen; +@@ -219,7 +214,7 @@ RS_execute(Regis *r, char *str) + while (*c) + { + len++; +- c += pg_mblen(c); ++ c += pg_mblen_cstr(c); + } + + if (len < r->nchar) +@@ -230,7 +225,7 @@ RS_execute(Regis *r, char *str) + { + len -= r->nchar; + while (len-- > 0) +- c += pg_mblen(c); ++ c += pg_mblen_cstr(c); + } + + +@@ -250,7 +245,7 @@ RS_execute(Regis *r, char *str) + elog(ERROR, "unrecognized regis node type: %d", ptr->type); + } + ptr = ptr->next; +- c += pg_mblen(c); ++ c += pg_mblen_cstr(c); + } + + return true; +diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c +index 7c45e3206d4..b2d564ad5e1 100644 +--- a/src/backend/tsearch/spell.c ++++ b/src/backend/tsearch/spell.c +@@ -232,7 +232,7 @@ findchar(char *str, int c) + { + if (t_iseq(str, c)) + return str; +- str += pg_mblen(str); ++ str += pg_mblen_cstr(str); + } + + return NULL; +@@ 
-245,7 +245,7 @@ findchar2(char *str, int c1, int c2) + { + if (t_iseq(str, c1) || t_iseq(str, c2)) + return str; +- str += pg_mblen(str); ++ str += pg_mblen_cstr(str); + } + + return NULL; +@@ -352,6 +352,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) + char *next, + *sbuf = *sflagset; + int maxstep; ++ int clen; + bool stop = false; + bool met_comma = false; + +@@ -363,11 +364,11 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) + { + case FM_LONG: + case FM_CHAR: +- COPYCHAR(sflag, *sflagset); +- sflag += pg_mblen(*sflagset); ++ clen = ts_copychar_cstr(sflag, *sflagset); ++ sflag += clen; + + /* Go to start of the next flag */ +- *sflagset += pg_mblen(*sflagset); ++ *sflagset += clen; + + /* Check if we get all characters of flag */ + maxstep--; +@@ -391,7 +392,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) + *sflagset = next; + while (**sflagset) + { +- if (t_isdigit(*sflagset)) ++ if (t_isdigit_cstr(*sflagset)) + { + if (!met_comma) + ereport(ERROR, +@@ -409,7 +410,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) + *sflagset))); + met_comma = true; + } +- else if (!t_isspace(*sflagset)) ++ else if (!t_isspace_cstr(*sflagset)) + { + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), +@@ -417,7 +418,7 @@ getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag) + *sflagset))); + } + +- *sflagset += pg_mblen(*sflagset); ++ *sflagset += pg_mblen_cstr(*sflagset); + } + stop = true; + break; +@@ -543,7 +544,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename) + while (*s) + { + /* we allow only single encoded flags for faster works */ +- if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s)) ++ if (pg_mblen_cstr(s) == 1 && t_isprint_unbounded(s) && !t_isspace_unbounded(s)) + s++; + else + { +@@ -559,12 +560,12 @@ NIImportDictionary(IspellDict *Conf, const char *filename) + s = line; + while (*s) + { +- if (t_isspace(s)) ++ if 
(t_isspace_cstr(s)) + { + *s = '\0'; + break; + } +- s += pg_mblen(s); ++ s += pg_mblen_cstr(s); + } + pstr = lowerstr_ctx(Conf, line); + +@@ -816,17 +817,17 @@ get_nextfield(char **str, char *next) + + while (**str) + { ++ int clen = pg_mblen_cstr(*str); ++ + if (state == PAE_WAIT_MASK) + { + if (t_iseq(*str, '#')) + return false; +- else if (!t_isspace(*str)) ++ else if (!t_isspace_cstr(*str)) + { +- int clen = pg_mblen(*str); +- + if (clen < avail) + { +- COPYCHAR(next, *str); ++ ts_copychar_with_len(next, *str, clen); + next += clen; + avail -= clen; + } +@@ -835,24 +836,22 @@ get_nextfield(char **str, char *next) + } + else /* state == PAE_INMASK */ + { +- if (t_isspace(*str)) ++ if (t_isspace_cstr(*str)) + { + *next = '\0'; + return true; + } + else + { +- int clen = pg_mblen(*str); +- + if (clen < avail) + { +- COPYCHAR(next, *str); ++ ts_copychar_with_len(next, *str, clen); + next += clen; + avail -= clen; + } + } + } +- *str += pg_mblen(*str); ++ *str += clen; + } + + *next = '\0'; +@@ -942,14 +941,15 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + + while (*str) + { ++ int clen = pg_mblen_cstr(str); ++ + if (state == PAE_WAIT_MASK) + { + if (t_iseq(str, '#')) + return false; +- else if (!t_isspace(str)) ++ else if (!t_isspace_cstr(str)) + { +- COPYCHAR(pmask, str); +- pmask += pg_mblen(str); ++ pmask += ts_copychar_with_len(pmask, str, clen); + state = PAE_INMASK; + } + } +@@ -960,10 +960,9 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + *pmask = '\0'; + state = PAE_WAIT_FIND; + } +- else if (!t_isspace(str)) ++ else if (!t_isspace_cstr(str)) + { +- COPYCHAR(pmask, str); +- pmask += pg_mblen(str); ++ pmask += ts_copychar_with_len(pmask, str, clen); + } + } + else if (state == PAE_WAIT_FIND) +@@ -972,13 +971,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + { + state = PAE_INFIND; + } +- else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ ) ++ else if (t_isalpha_cstr(str) || t_iseq(str, 
'\'') /* english 's */ ) + { +- COPYCHAR(prepl, str); +- prepl += pg_mblen(str); ++ prepl += ts_copychar_with_len(prepl, str, clen); + state = PAE_INREPL; + } +- else if (!t_isspace(str)) ++ else if (!t_isspace_cstr(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); +@@ -990,12 +988,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + *pfind = '\0'; + state = PAE_WAIT_REPL; + } +- else if (t_isalpha(str)) ++ else if (t_isalpha_cstr(str)) + { +- COPYCHAR(pfind, str); +- pfind += pg_mblen(str); ++ pfind += ts_copychar_with_len(pfind, str, clen); + } +- else if (!t_isspace(str)) ++ else if (!t_isspace_cstr(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); +@@ -1006,13 +1003,12 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + { + break; /* void repl */ + } +- else if (t_isalpha(str)) ++ else if (t_isalpha_cstr(str)) + { +- COPYCHAR(prepl, str); +- prepl += pg_mblen(str); ++ prepl += ts_copychar_with_len(prepl, str, clen); + state = PAE_INREPL; + } +- else if (!t_isspace(str)) ++ else if (!t_isspace_cstr(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); +@@ -1024,12 +1020,11 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + *prepl = '\0'; + break; + } +- else if (t_isalpha(str)) ++ else if (t_isalpha_cstr(str)) + { +- COPYCHAR(prepl, str); +- prepl += pg_mblen(str); ++ prepl += ts_copychar_with_len(prepl, str, clen); + } +- else if (!t_isspace(str)) ++ else if (!t_isspace_cstr(str)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("syntax error"))); +@@ -1037,7 +1032,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl) + else + elog(ERROR, "unrecognized state in parse_affentry: %d", state); + +- str += pg_mblen(str); ++ str += clen; + } + + *pmask = *pfind = *prepl = '\0'; +@@ -1090,10 +1085,9 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) + 
CompoundAffixFlag *newValue; + char sbuf[BUFSIZ]; + char *sflag; +- int clen; + +- while (*s && t_isspace(s)) +- s += pg_mblen(s); ++ while (*s && t_isspace_cstr(s)) ++ s += pg_mblen_cstr(s); + + if (!*s) + ereport(ERROR, +@@ -1102,10 +1096,10 @@ addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val) + + /* Get flag without \n */ + sflag = sbuf; +- while (*s && !t_isspace(s) && *s != '\n') ++ while (*s && !t_isspace_cstr(s) && *s != '\n') + { +- clen = pg_mblen(s); +- COPYCHAR(sflag, s); ++ int clen = ts_copychar_cstr(sflag, s); ++ + sflag += clen; + s += clen; + } +@@ -1248,7 +1242,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) + + while ((recoded = tsearch_readline(&trst)) != NULL) + { +- if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) ++ if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#')) + { + pfree(recoded); + continue; +@@ -1285,8 +1279,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) + { + char *s = recoded + strlen("FLAG"); + +- while (*s && t_isspace(s)) +- s += pg_mblen(s); ++ while (*s && t_isspace_cstr(s)) ++ s += pg_mblen_cstr(s); + + if (*s) + { +@@ -1321,7 +1315,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename) + { + int fields_read; + +- if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) ++ if (*recoded == '\0' || t_isspace_cstr(recoded) || t_iseq(recoded, '#')) + goto nextline; + + fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask); +@@ -1484,12 +1478,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename) + s = findchar2(recoded, 'l', 'L'); + if (s) + { +- while (*s && !t_isspace(s)) +- s += pg_mblen(s); +- while (*s && t_isspace(s)) +- s += pg_mblen(s); ++ while (*s && !t_isspace_cstr(s)) ++ s += pg_mblen_cstr(s); ++ while (*s && t_isspace_cstr(s)) ++ s += pg_mblen_cstr(s); + +- if (*s && pg_mblen(s) == 1) ++ if (*s && pg_mblen_cstr(s) == 1) + { + addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG); + 
Conf->usecompound = true; +@@ -1517,8 +1511,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename) + s = recoded + 4; /* we need non-lowercased string */ + flagflags = 0; + +- while (*s && t_isspace(s)) +- s += pg_mblen(s); ++ while (*s && t_isspace_cstr(s)) ++ s += pg_mblen_cstr(s); + + if (*s == '*') + { +@@ -1539,14 +1533,13 @@ NIImportAffixes(IspellDict *Conf, const char *filename) + * be followed by EOL, whitespace, or ':'. Otherwise this is a + * new-format flag command. + */ +- if (*s && pg_mblen(s) == 1) ++ if (*s && pg_mblen_cstr(s) == 1) + { +- COPYCHAR(flag, s); ++ flag[0] = *s++; + flag[1] = '\0'; + +- s++; + if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' || +- t_isspace(s)) ++ t_isspace_cstr(s)) + { + oldformat = true; + goto nextline; +@@ -1769,7 +1762,7 @@ NISortDictionary(IspellDict *Conf) + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", + Conf->Spell[i]->p.flag))); +- if (*end != '\0' && !t_isdigit(end) && !t_isspace(end)) ++ if (*end != '\0' && !t_isdigit_cstr(end) && !t_isspace_cstr(end)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid affix alias \"%s\"", +diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c +index 9b199d0ac18..3b189fc68d1 100644 +--- a/src/backend/tsearch/ts_locale.c ++++ b/src/backend/tsearch/ts_locale.c +@@ -32,70 +32,43 @@ static void tsearch_readline_callback(void *arg); + */ + #define WC_BUF_LEN 3 + +-int +-t_isdigit(const char *ptr) +-{ +- int clen = pg_mblen(ptr); +- wchar_t character[WC_BUF_LEN]; +- Oid collation = DEFAULT_COLLATION_OID; /* TODO */ +- pg_locale_t mylocale = 0; /* TODO */ +- +- if (clen == 1 || lc_ctype_is_c(collation)) +- return isdigit(TOUCHAR(ptr)); +- +- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); +- +- return iswdigit((wint_t) character[0]); +-} +- +-int +-t_isspace(const char *ptr) +-{ +- int clen = pg_mblen(ptr); +- wchar_t character[WC_BUF_LEN]; +- Oid collation = DEFAULT_COLLATION_OID; /* 
TODO */ +- pg_locale_t mylocale = 0; /* TODO */ +- +- if (clen == 1 || lc_ctype_is_c(collation)) +- return isspace(TOUCHAR(ptr)); +- +- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); +- +- return iswspace((wint_t) character[0]); +-} +- +-int +-t_isalpha(const char *ptr) +-{ +- int clen = pg_mblen(ptr); +- wchar_t character[WC_BUF_LEN]; +- Oid collation = DEFAULT_COLLATION_OID; /* TODO */ +- pg_locale_t mylocale = 0; /* TODO */ +- +- if (clen == 1 || lc_ctype_is_c(collation)) +- return isalpha(TOUCHAR(ptr)); +- +- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); +- +- return iswalpha((wint_t) character[0]); +-} +- +-int +-t_isprint(const char *ptr) +-{ +- int clen = pg_mblen(ptr); +- wchar_t character[WC_BUF_LEN]; +- Oid collation = DEFAULT_COLLATION_OID; /* TODO */ +- pg_locale_t mylocale = 0; /* TODO */ +- +- if (clen == 1 || lc_ctype_is_c(collation)) +- return isprint(TOUCHAR(ptr)); +- +- char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); +- +- return iswprint((wint_t) character[0]); ++#define GENERATE_T_ISCLASS_DEF(character_class) \ ++/* mblen shall be that of the first character */ \ ++int \ ++t_is##character_class##_with_len(const char *ptr, int mblen) \ ++{ \ ++ int clen = pg_mblen_with_len(ptr, mblen); \ ++ wchar_t character[WC_BUF_LEN]; \ ++ pg_locale_t mylocale = 0; /* TODO */ \ ++ if (clen == 1 || lc_ctype_is_c(DEFAULT_COLLATION_OID)) \ ++ return is##character_class(TOUCHAR(ptr)); \ ++ char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); \ ++ return isw##character_class((wint_t) character[0]); \ ++} \ ++\ ++/* ptr shall point to a NUL-terminated string */ \ ++int \ ++t_is##character_class##_cstr(const char *ptr) \ ++{ \ ++ return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \ ++} \ ++/* ptr shall point to a string with pre-validated encoding */ \ ++int \ ++t_is##character_class##_unbounded(const char *ptr) \ ++{ \ ++ return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \ ++} \ ++/* historical name 
for _unbounded */ \ ++int \ ++t_is##character_class(const char *ptr) \ ++{ \ ++ return t_is##character_class##_unbounded(ptr); \ + } + ++GENERATE_T_ISCLASS_DEF(alpha) ++GENERATE_T_ISCLASS_DEF(digit) ++GENERATE_T_ISCLASS_DEF(print) ++GENERATE_T_ISCLASS_DEF(space) + + /* + * Set up to read a file using tsearch_readline(). This facility is +diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c +index e74b85a6900..d1c4ca959b0 100644 +--- a/src/backend/tsearch/ts_selfuncs.c ++++ b/src/backend/tsearch/ts_selfuncs.c +@@ -109,12 +109,14 @@ tsmatchsel(PG_FUNCTION_ARGS) + * OK, there's a Var and a Const we're dealing with here. We need the + * Const to be a TSQuery, else we can't do anything useful. We have to + * check this because the Var might be the TSQuery not the TSVector. ++ * ++ * Also check that the Var really is a TSVector, in case this estimator is ++ * mistakenly attached to some other operator. + */ +- if (((Const *) other)->consttype == TSQUERYOID) ++ if (((Const *) other)->consttype == TSQUERYOID && ++ vardata.vartype == TSVECTOROID) + { + /* tsvector @@ tsquery or the other way around */ +- Assert(vardata.vartype == TSVECTOROID); +- + selec = tsquerysel(&vardata, ((Const *) other)->constvalue); + } + else +diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c +index 3bc6b32095f..f6c367ea6a4 100644 +--- a/src/backend/tsearch/ts_utils.c ++++ b/src/backend/tsearch/ts_utils.c +@@ -88,8 +88,8 @@ readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) + char *pbuf = line; + + /* Trim trailing space */ +- while (*pbuf && !t_isspace(pbuf)) +- pbuf += pg_mblen(pbuf); ++ while (*pbuf && !t_isspace_cstr(pbuf)) ++ pbuf += pg_mblen_cstr(pbuf); + *pbuf = '\0'; + + /* Skip empty lines */ +diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c +index af97b5b4213..99c5648024f 100644 +--- a/src/backend/tsearch/wparser_def.c ++++ b/src/backend/tsearch/wparser_def.c +@@ -1735,7 
+1735,8 @@ TParserGet(TParser *prs) + prs->state->charlen = 0; + else + prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen : +- pg_mblen(prs->str + prs->state->posbyte); ++ pg_mblen_range(prs->str + prs->state->posbyte, ++ prs->str + prs->lenstr); + + Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr); + Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null); +diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c +index 5e9e6809785..bd8defcbf42 100644 +--- a/src/backend/utils/adt/arrayfuncs.c ++++ b/src/backend/utils/adt/arrayfuncs.c +@@ -3357,6 +3357,92 @@ construct_array(Datum *elems, int nelems, + elmtype, elmlen, elmbyval, elmalign); + } + ++/* ++ * Like construct_array(), where elmtype must be a built-in type, and ++ * elmlen/elmbyval/elmalign is looked up from hardcoded data. This is often ++ * useful when manipulating arrays from/for system catalogs. ++ */ ++ArrayType * ++construct_array_builtin(Datum *elems, int nelems, Oid elmtype) ++{ ++ int elmlen; ++ bool elmbyval; ++ char elmalign; ++ ++ switch (elmtype) ++ { ++ case CHAROID: ++ elmlen = 1; ++ elmbyval = true; ++ elmalign = TYPALIGN_CHAR; ++ break; ++ ++ case CSTRINGOID: ++ elmlen = -2; ++ elmbyval = false; ++ elmalign = TYPALIGN_CHAR; ++ break; ++ ++ case FLOAT4OID: ++ elmlen = sizeof(float4); ++ elmbyval = true; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case INT2OID: ++ elmlen = sizeof(int16); ++ elmbyval = true; ++ elmalign = TYPALIGN_SHORT; ++ break; ++ ++ case INT4OID: ++ elmlen = sizeof(int32); ++ elmbyval = true; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case INT8OID: ++ elmlen = sizeof(int64); ++ elmbyval = FLOAT8PASSBYVAL; ++ elmalign = TYPALIGN_DOUBLE; ++ break; ++ ++ case NAMEOID: ++ elmlen = NAMEDATALEN; ++ elmbyval = false; ++ elmalign = TYPALIGN_CHAR; ++ break; ++ ++ case OIDOID: ++ case REGTYPEOID: ++ elmlen = sizeof(Oid); ++ elmbyval = true; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case TEXTOID: ++ elmlen = 
-1; ++ elmbyval = false; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case TIDOID: ++ elmlen = sizeof(ItemPointerData); ++ elmbyval = false; ++ elmalign = TYPALIGN_SHORT; ++ break; ++ ++ default: ++ elog(ERROR, "type %u not supported by construct_array_builtin()", elmtype); ++ /* keep compiler quiet */ ++ elmlen = 0; ++ elmbyval = false; ++ elmalign = 0; ++ } ++ ++ return construct_array(elems, nelems, elmtype, elmlen, elmbyval, elmalign); ++} ++ + /* + * construct_md_array --- simple method for constructing an array object + * with arbitrary dimensions and possible NULLs +@@ -3575,6 +3661,81 @@ deconstruct_array(ArrayType *array, + } + } + ++/* ++ * Like deconstruct_array(), where elmtype must be a built-in type, and ++ * elmlen/elmbyval/elmalign is looked up from hardcoded data. This is often ++ * useful when manipulating arrays from/for system catalogs. ++ */ ++void ++deconstruct_array_builtin(ArrayType *array, ++ Oid elmtype, ++ Datum **elemsp, bool **nullsp, int *nelemsp) ++{ ++ int elmlen; ++ bool elmbyval; ++ char elmalign; ++ ++ switch (elmtype) ++ { ++ case CHAROID: ++ elmlen = 1; ++ elmbyval = true; ++ elmalign = TYPALIGN_CHAR; ++ break; ++ ++ case CSTRINGOID: ++ elmlen = -2; ++ elmbyval = false; ++ elmalign = TYPALIGN_CHAR; ++ break; ++ ++ case FLOAT8OID: ++ elmlen = sizeof(float8); ++ elmbyval = FLOAT8PASSBYVAL; ++ elmalign = TYPALIGN_DOUBLE; ++ break; ++ ++ case INT2OID: ++ elmlen = sizeof(int16); ++ elmbyval = true; ++ elmalign = TYPALIGN_SHORT; ++ break; ++ ++ case INT4OID: ++ elmlen = sizeof(int32); ++ elmbyval = true; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case OIDOID: ++ elmlen = sizeof(Oid); ++ elmbyval = true; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case TEXTOID: ++ elmlen = -1; ++ elmbyval = false; ++ elmalign = TYPALIGN_INT; ++ break; ++ ++ case TIDOID: ++ elmlen = sizeof(ItemPointerData); ++ elmbyval = false; ++ elmalign = TYPALIGN_SHORT; ++ break; ++ ++ default: ++ elog(ERROR, "type %u not supported by deconstruct_array_builtin()", 
elmtype); ++ /* keep compiler quiet */ ++ elmlen = 0; ++ elmbyval = false; ++ elmalign = 0; ++ } ++ ++ deconstruct_array(array, elmtype, elmlen, elmbyval, elmalign, elemsp, nullsp, nelemsp); ++} ++ + /* + * array_contains_nulls --- detect whether an array has any null elements + * +diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c +index 61d318d93ca..767efb87dd7 100644 +--- a/src/backend/utils/adt/encode.c ++++ b/src/backend/utils/adt/encode.c +@@ -15,6 +15,7 @@ + + #include + ++#include "mb/pg_wchar.h" + #include "utils/builtins.h" + #include "utils/memutils.h" + +@@ -170,18 +171,42 @@ hex_encode(const char *src, size_t len, char *dst) + return (uint64) len * 2; + } + ++/* ++ * compat version of get_hex without end parameter used in error report ++ */ ++static inline char ++get_hex(const char *cp) ++{ ++ unsigned char c = (unsigned char) *cp; ++ int res = -1; ++ ++ if (c < 127) ++ res = hexlookup[(unsigned char) c]; ++ ++ if (res < 0) ++ ereport(ERROR, ++ (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ++ errmsg("invalid hexadecimal digit"))); ++ ++ return (char) res; ++} ++/* ++ * original function from backport using two args ++ */ + static inline char +-get_hex(char c) ++get_hex_new(const char *cp, const char *end) + { ++ unsigned char c = (unsigned char) *cp; + int res = -1; + +- if (c > 0 && c < 127) ++ if (c < 127) + res = hexlookup[(unsigned char) c]; + + if (res < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +- errmsg("invalid hexadecimal digit: \"%c\"", c))); ++ errmsg("invalid hexadecimal digit: \"%.*s\"", ++ pg_mblen_range(cp, end), cp))); + + return (char) res; + } +@@ -205,13 +230,15 @@ hex_decode(const char *src, size_t len, char *dst) + s++; + continue; + } +- v1 = get_hex(*s++) << 4; ++ v1 = get_hex_new(s, srcend) << 4; ++ s++; + if (s >= srcend) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid hexadecimal data: odd number of digits"))); + +- v2 = get_hex(*s++); ++ v2 = 
get_hex_new(s, srcend); ++ s++; + *p++ = v1 | v2; + } + +@@ -338,7 +365,8 @@ pg_base64_decode(const char *src, size_t len, char *dst) + if (b < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +- errmsg("invalid symbol \"%c\" while decoding base64 sequence", (int) c))); ++ errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence", ++ pg_mblen_range(s - 1, srcend), s - 1))); + } + /* add it to buffer */ + buf = (buf << 6) + b; +diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c +index 47bef085fa3..d4f6e3e21ef 100644 +--- a/src/backend/utils/adt/formatting.c ++++ b/src/backend/utils/adt/formatting.c +@@ -1392,7 +1392,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, + ereport(ERROR, + (errcode(ERRCODE_INVALID_DATETIME_FORMAT), + errmsg("invalid datetime format separator: \"%s\"", +- pnstrdup(str, pg_mblen(str))))); ++ pnstrdup(str, pg_mblen_cstr(str))))); + + if (*str == ' ') + n->type = NODE_TYPE_SPACE; +@@ -1422,7 +1422,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, + /* backslash quotes the next character, if any */ + if (*str == '\\' && *(str + 1)) + str++; +- chlen = pg_mblen(str); ++ chlen = pg_mblen_cstr(str); + n->type = NODE_TYPE_CHAR; + memcpy(n->character, str, chlen); + n->character[chlen] = '\0'; +@@ -1440,7 +1440,7 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw, + */ + if (*str == '\\' && *(str + 1) == '"') + str++; +- chlen = pg_mblen(str); ++ chlen = pg_mblen_cstr(str); + + if ((flags & DCH_FLAG) && is_separator_char(str)) + n->type = NODE_TYPE_SEPARATOR; +@@ -2152,8 +2152,8 @@ asc_toupper_z(const char *buff) + do { \ + if (S_THth(_suf)) \ + { \ +- if (*(ptr)) (ptr) += pg_mblen(ptr); \ +- if (*(ptr)) (ptr) += pg_mblen(ptr); \ ++ if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ ++ if (*(ptr)) (ptr) += pg_mblen_cstr(ptr); \ + } \ + } while (0) + +@@ -3366,7 +3366,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar 
*out, + * insist that the consumed character match the format's + * character. + */ +- s += pg_mblen(s); ++ s += pg_mblen_cstr(s); + } + continue; + } +@@ -3388,11 +3388,11 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, + if (extra_skip > 0) + extra_skip--; + else +- s += pg_mblen(s); ++ s += pg_mblen_cstr(s); + } + else + { +- int chlen = pg_mblen(s); ++ int chlen = pg_mblen_cstr(s); + + /* + * Standard mode requires strict match of format characters. +@@ -5564,13 +5564,15 @@ NUM_numpart_to_char(NUMProc *Np, int id) + static void + NUM_eat_non_data_chars(NUMProc *Np, int n, int input_len) + { ++ const char *end = Np->inout + input_len; ++ + while (n-- > 0) + { + if (OVERLOAD_TEST) + break; /* end of input */ + if (strchr("0123456789.,+-", *Np->inout_p) != NULL) + break; /* it's a data character */ +- Np->inout_p += pg_mblen(Np->inout_p); ++ Np->inout_p += pg_mblen_range(Np->inout_p, end); + } + } + +@@ -6027,7 +6029,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout, + } + else + { +- Np->inout_p += pg_mblen(Np->inout_p); ++ Np->inout_p += pg_mblen_range(Np->inout_p, Np->inout + input_len); + } + continue; + } +diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c +index 69e8dea1a19..13c9b0e9e96 100644 +--- a/src/backend/utils/adt/jsonfuncs.c ++++ b/src/backend/utils/adt/jsonfuncs.c +@@ -663,7 +663,7 @@ report_json_context(JsonLexContext *lex) + break; + /* Advance to next multibyte character */ + if (IS_HIGHBIT_SET(*context_start)) +- context_start += pg_mblen(context_start); ++ context_start += pg_mblen_range(context_start, context_end); + else + context_start++; + } +diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y +index 287d497477c..362027ab80e 100644 +--- a/src/backend/utils/adt/jsonpath_gram.y ++++ b/src/backend/utils/adt/jsonpath_gram.y +@@ -526,8 +526,8 @@ makeItemLikeRegex(JsonPathParseItem *expr, JsonPathString *pattern, + ereport(ERROR, + 
(errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid input syntax for type %s", "jsonpath"), +- errdetail("unrecognized flag character \"%c\" in LIKE_REGEX predicate", +- flags->val[i]))); ++ errdetail("unrecognized flag character \"%.*s\" in LIKE_REGEX predicate", ++ pg_mblen_range(flags->val + i, flags->val + flags->len), flags->val + i))); + break; + } + } +diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c +index d11278c505b..4d656f0af2c 100644 +--- a/src/backend/utils/adt/levenshtein.c ++++ b/src/backend/utils/adt/levenshtein.c +@@ -84,6 +84,8 @@ varstr_levenshtein(const char *source, int slen, + int i, + j; + const char *y; ++ const char *send = source + slen; ++ const char *tend = target + tlen; + + /* + * For varstr_levenshtein_less_equal, we have real variables called +@@ -184,10 +186,10 @@ varstr_levenshtein(const char *source, int slen, + #endif + + /* +- * In order to avoid calling pg_mblen() repeatedly on each character in s, +- * we cache all the lengths before starting the main loop -- but if all +- * the characters in both strings are single byte, then we skip this and +- * use a fast-path in the main loop. If only one string contains ++ * In order to avoid calling pg_mblen_range() repeatedly on each character ++ * in s, we cache all the lengths before starting the main loop -- but if ++ * all the characters in both strings are single byte, then we skip this ++ * and use a fast-path in the main loop. If only one string contains + * multi-byte characters, we still build the array, so that the fast-path + * needn't deal with the case where the array hasn't been initialized. 
+ */ +@@ -199,7 +201,7 @@ varstr_levenshtein(const char *source, int slen, + s_char_len = (int *) palloc((m + 1) * sizeof(int)); + for (i = 0; i < m; ++i) + { +- s_char_len[i] = pg_mblen(cp); ++ s_char_len[i] = pg_mblen_range(cp, send); + cp += s_char_len[i]; + } + s_char_len[i] = 0; +@@ -225,7 +227,7 @@ varstr_levenshtein(const char *source, int slen, + { + int *temp; + const char *x = source; +- int y_char_len = n != tlen + 1 ? pg_mblen(y) : 1; ++ int y_char_len = n != tlen + 1 ? pg_mblen_range(y, tend) : 1; + + #ifdef LEVENSHTEIN_LESS_EQUAL + +diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c +index 5bf94628c30..dde8ec2097d 100644 +--- a/src/backend/utils/adt/like.c ++++ b/src/backend/utils/adt/like.c +@@ -54,20 +54,20 @@ static int Generic_Text_IC_like(text *str, text *pat, Oid collation); + *-------------------- + */ + static inline int +-wchareq(const char *p1, const char *p2) ++wchareq(const char *p1, int p1len, const char *p2, int p2len) + { +- int p1_len; ++ int p1clen; + + /* Optimization: quickly compare the first byte. 
*/ + if (*p1 != *p2) + return 0; + +- p1_len = pg_mblen(p1); +- if (pg_mblen(p2) != p1_len) ++ p1clen = pg_mblen_with_len(p1, p1len); ++ if (pg_mblen_with_len(p2, p2len) != p1clen) + return 0; + + /* They are the same length */ +- while (p1_len--) ++ while (p1clen--) + { + if (*p1++ != *p2++) + return 0; +@@ -106,11 +106,11 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) + #define NextByte(p, plen) ((p)++, (plen)--) + + /* Set up to compile like_match.c for multibyte characters */ +-#define CHAREQ(p1, p2) wchareq((p1), (p2)) ++#define CHAREQ(p1, p1len, p2, p2len) wchareq((p1), (p1len), (p2), (p2len)) + #define NextChar(p, plen) \ +- do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) ++ do { int __l = pg_mblen_with_len((p), (plen)); (p) +=__l; (plen) -=__l; } while (0) + #define CopyAdvChar(dst, src, srclen) \ +- do { int __l = pg_mblen(src); \ ++ do { int __l = pg_mblen_with_len((src), (srclen)); \ + (srclen) -= __l; \ + while (__l-- > 0) \ + *(dst)++ = *(src)++; \ +@@ -122,7 +122,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale, bool locale_is_c) + #include "like_match.c" + + /* Set up to compile like_match.c for single-byte characters */ +-#define CHAREQ(p1, p2) (*(p1) == *(p2)) ++#define CHAREQ(p1, p1len, p2, p2len) (*(p1) == *(p2)) + #define NextChar(p, plen) NextByte((p), (plen)) + #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) + +diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c +index ee30170fbb4..9a93744aff6 100644 +--- a/src/backend/utils/adt/like_match.c ++++ b/src/backend/utils/adt/like_match.c +@@ -294,6 +294,7 @@ do_like_escape(text *pat, text *esc) + errhint("Escape string must be empty or one character."))); + + e = VARDATA_ANY(esc); ++ elen = VARSIZE_ANY_EXHDR(esc); + + /* + * If specified escape is '\', just copy the pattern as-is. 
+@@ -312,7 +313,7 @@ do_like_escape(text *pat, text *esc) + afterescape = false; + while (plen > 0) + { +- if (CHAREQ(p, e) && !afterescape) ++ if (CHAREQ(p, plen, e, elen) && !afterescape) + { + *r++ = '\\'; + NextChar(p, plen); +diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c +index 955e0ee87f8..9a08dea351d 100644 +--- a/src/backend/utils/adt/network_selfuncs.c ++++ b/src/backend/utils/adt/network_selfuncs.c +@@ -43,9 +43,9 @@ + /* Maximum number of items to consider in join selectivity calculations */ + #define MAX_CONSIDERED_ELEMS 1024 + +-static Selectivity networkjoinsel_inner(Oid operator, ++static Selectivity networkjoinsel_inner(Oid operator, int opr_codenum, + VariableStatData *vardata1, VariableStatData *vardata2); +-static Selectivity networkjoinsel_semi(Oid operator, ++static Selectivity networkjoinsel_semi(Oid operator, int opr_codenum, + VariableStatData *vardata1, VariableStatData *vardata2); + static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues); + static Selectivity inet_hist_value_sel(Datum *values, int nvalues, +@@ -82,6 +82,7 @@ networksel(PG_FUNCTION_ARGS) + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); ++ int opr_codenum; + VariableStatData vardata; + Node *other; + bool varonleft; +@@ -95,6 +96,14 @@ networksel(PG_FUNCTION_ARGS) + nullfrac; + FmgrInfo proc; + ++ /* ++ * Before all else, verify that the operator is one of the ones supported ++ * by this function, which in turn proves that the input datatypes are ++ * what we expect. Otherwise, attaching this selectivity function to some ++ * unexpected operator could cause trouble. ++ */ ++ opr_codenum = inet_opr_codenum(operator); ++ + /* + * If expression is not (variable op something) or (something op + * variable), then punt and return a default estimate. 
+@@ -150,13 +159,12 @@ networksel(PG_FUNCTION_ARGS) + STATISTIC_KIND_HISTOGRAM, InvalidOid, + ATTSTATSSLOT_VALUES)) + { +- int opr_codenum = inet_opr_codenum(operator); ++ int h_codenum; + + /* Commute if needed, so we can consider histogram to be on the left */ +- if (!varonleft) +- opr_codenum = -opr_codenum; ++ h_codenum = varonleft ? opr_codenum : -opr_codenum; + non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues, +- constvalue, opr_codenum); ++ constvalue, h_codenum); + + free_attstatsslot(&hslot); + } +@@ -203,10 +211,19 @@ networkjoinsel(PG_FUNCTION_ARGS) + #endif + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + double selec; ++ int opr_codenum; + VariableStatData vardata1; + VariableStatData vardata2; + bool join_is_reversed; + ++ /* ++ * Before all else, verify that the operator is one of the ones supported ++ * by this function, which in turn proves that the input datatypes are ++ * what we expect. Otherwise, attaching this selectivity function to some ++ * unexpected operator could cause trouble. ++ */ ++ opr_codenum = inet_opr_codenum(operator); ++ + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + +@@ -220,15 +237,18 @@ networkjoinsel(PG_FUNCTION_ARGS) + * Selectivity for left/full join is not exactly the same as inner + * join, but we neglect the difference, as eqjoinsel does. + */ +- selec = networkjoinsel_inner(operator, &vardata1, &vardata2); ++ selec = networkjoinsel_inner(operator, opr_codenum, ++ &vardata1, &vardata2); + break; + case JOIN_SEMI: + case JOIN_ANTI: + /* Here, it's important that we pass the outer var on the left. 
*/ + if (!join_is_reversed) +- selec = networkjoinsel_semi(operator, &vardata1, &vardata2); ++ selec = networkjoinsel_semi(operator, opr_codenum, ++ &vardata1, &vardata2); + else + selec = networkjoinsel_semi(get_commutator(operator), ++ -opr_codenum, + &vardata2, &vardata1); + break; + default: +@@ -260,7 +280,7 @@ networkjoinsel(PG_FUNCTION_ARGS) + * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner(). + */ + static Selectivity +-networkjoinsel_inner(Oid operator, ++networkjoinsel_inner(Oid operator, int opr_codenum, + VariableStatData *vardata1, VariableStatData *vardata2) + { + Form_pg_statistic stats; +@@ -273,7 +293,6 @@ networkjoinsel_inner(Oid operator, + mcv2_exists = false, + hist1_exists = false, + hist2_exists = false; +- int opr_codenum; + int mcv1_length = 0, + mcv2_length = 0; + AttStatsSlot mcv1_slot; +@@ -325,8 +344,6 @@ networkjoinsel_inner(Oid operator, + memset(&hist2_slot, 0, sizeof(hist2_slot)); + } + +- opr_codenum = inet_opr_codenum(operator); +- + /* + * Calculate selectivity for MCV vs MCV matches. + */ +@@ -387,7 +404,7 @@ networkjoinsel_inner(Oid operator, + * histogram selectivity for semi/anti join cases. + */ + static Selectivity +-networkjoinsel_semi(Oid operator, ++networkjoinsel_semi(Oid operator, int opr_codenum, + VariableStatData *vardata1, VariableStatData *vardata2) + { + Form_pg_statistic stats; +@@ -401,7 +418,6 @@ networkjoinsel_semi(Oid operator, + mcv2_exists = false, + hist1_exists = false, + hist2_exists = false; +- int opr_codenum; + FmgrInfo proc; + int i, + mcv1_length = 0, +@@ -455,7 +471,6 @@ networkjoinsel_semi(Oid operator, + memset(&hist2_slot, 0, sizeof(hist2_slot)); + } + +- opr_codenum = inet_opr_codenum(operator); + fmgr_info(get_opcode(operator), &proc); + + /* Estimate number of input rows represented by RHS histogram. 
*/ +@@ -827,6 +842,9 @@ inet_semi_join_sel(Datum lhs_value, + /* + * Assign useful code numbers for the subnet inclusion/overlap operators + * ++ * This will throw an error if the operator is not one of the ones we ++ * support in networksel() and networkjoinsel(). ++ * + * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend + * on the exact codes assigned here; but many other places in this file + * know that they can negate a code to obtain the code for the commutator +diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c +index 60d2d4cc768..4427a9a0829 100644 +--- a/src/backend/utils/adt/oracle_compat.c ++++ b/src/backend/utils/adt/oracle_compat.c +@@ -148,8 +148,8 @@ lpad(PG_FUNCTION_ARGS) + char *ptr1, + *ptr2, + *ptr2start, +- *ptr2end, + *ptr_ret; ++ const char *ptr2end; + int m, + s1len, + s2len; +@@ -194,7 +194,7 @@ lpad(PG_FUNCTION_ARGS) + + while (m--) + { +- int mlen = pg_mblen(ptr2); ++ int mlen = pg_mblen_range(ptr2, ptr2end); + + memcpy(ptr_ret, ptr2, mlen); + ptr_ret += mlen; +@@ -207,7 +207,7 @@ lpad(PG_FUNCTION_ARGS) + + while (s1len--) + { +- int mlen = pg_mblen(ptr1); ++ int mlen = pg_mblen_unbounded(ptr1); + + memcpy(ptr_ret, ptr1, mlen); + ptr_ret += mlen; +@@ -246,8 +246,8 @@ rpad(PG_FUNCTION_ARGS) + char *ptr1, + *ptr2, + *ptr2start, +- *ptr2end, + *ptr_ret; ++ const char *ptr2end; + int m, + s1len, + s2len; +@@ -286,11 +286,12 @@ rpad(PG_FUNCTION_ARGS) + m = len - s1len; + + ptr1 = VARDATA_ANY(string1); ++ + ptr_ret = VARDATA(ret); + + while (s1len--) + { +- int mlen = pg_mblen(ptr1); ++ int mlen = pg_mblen_unbounded(ptr1); + + memcpy(ptr_ret, ptr1, mlen); + ptr_ret += mlen; +@@ -302,7 +303,7 @@ rpad(PG_FUNCTION_ARGS) + + while (m--) + { +- int mlen = pg_mblen(ptr2); ++ int mlen = pg_mblen_range(ptr2, ptr2end); + + memcpy(ptr_ret, ptr2, mlen); + ptr_ret += mlen; +@@ -387,6 +388,7 @@ dotrim(const char *string, int stringlen, + */ + const char **stringchars; + const char **setchars; ++ 
const char *setend; + int *stringmblen; + int *setmblen; + int stringnchars; +@@ -394,6 +396,7 @@ dotrim(const char *string, int stringlen, + int resultndx; + int resultnchars; + const char *p; ++ const char *pend; + int len; + int mblen; + const char *str_pos; +@@ -404,10 +407,11 @@ dotrim(const char *string, int stringlen, + stringnchars = 0; + p = string; + len = stringlen; ++ pend = p + len; + while (len > 0) + { + stringchars[stringnchars] = p; +- stringmblen[stringnchars] = mblen = pg_mblen(p); ++ stringmblen[stringnchars] = mblen = pg_mblen_range(p, pend); + stringnchars++; + p += mblen; + len -= mblen; +@@ -418,10 +422,11 @@ dotrim(const char *string, int stringlen, + setnchars = 0; + p = set; + len = setlen; ++ setend = set + setlen; + while (len > 0) + { + setchars[setnchars] = p; +- setmblen[setnchars] = mblen = pg_mblen(p); ++ setmblen[setnchars] = mblen = pg_mblen_range(p, setend); + setnchars++; + p += mblen; + len -= mblen; +@@ -727,6 +732,8 @@ translate(PG_FUNCTION_ARGS) + *to_end; + char *source, + *target; ++ const char *source_end; ++ const char *from_end; + int m, + fromlen, + tolen, +@@ -741,9 +748,11 @@ translate(PG_FUNCTION_ARGS) + if (m <= 0) + PG_RETURN_TEXT_P(string); + source = VARDATA_ANY(string); ++ source_end = source + m; + + fromlen = VARSIZE_ANY_EXHDR(from); + from_ptr = VARDATA_ANY(from); ++ from_end = from_ptr + fromlen; + tolen = VARSIZE_ANY_EXHDR(to); + to_ptr = VARDATA_ANY(to); + to_end = to_ptr + tolen; +@@ -766,12 +775,12 @@ translate(PG_FUNCTION_ARGS) + + while (m > 0) + { +- source_len = pg_mblen(source); ++ source_len = pg_mblen_range(source, source_end); + from_index = 0; + + for (i = 0; i < fromlen; i += len) + { +- len = pg_mblen(&from_ptr[i]); ++ len = pg_mblen_range(&from_ptr[i], from_end); + if (len == source_len && + memcmp(source, &from_ptr[i], len) == 0) + break; +@@ -787,11 +796,11 @@ translate(PG_FUNCTION_ARGS) + { + if (p >= to_end) + break; +- p += pg_mblen(p); ++ p += pg_mblen_range(p, to_end); + } + if (p < 
to_end) + { +- len = pg_mblen(p); ++ len = pg_mblen_range(p, to_end); + memcpy(target, p, len); + target += len; + retlen += len; +diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c +index 1b2a0434ae2..fc567ba14d4 100644 +--- a/src/backend/utils/adt/regexp.c ++++ b/src/backend/utils/adt/regexp.c +@@ -423,8 +423,8 @@ parse_re_flags(pg_re_flags *flags, text *opts) + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +- errmsg("invalid regular expression option: \"%c\"", +- opt_p[i]))); ++ errmsg("invalid regular expression option: \"%.*s\"", ++ pg_mblen_range(opt_p + i, opt_p + opt_len), opt_p + i))); + break; + } + } +@@ -672,6 +672,7 @@ similar_escape_internal(text *pat_text, text *esc_text) + *r; + int plen, + elen; ++ const char *pend; + bool afterescape = false; + int nquotes = 0; + int bracket_depth = 0; /* square bracket nesting level */ +@@ -679,6 +680,7 @@ similar_escape_internal(text *pat_text, text *esc_text) + + p = VARDATA_ANY(pat_text); + plen = VARSIZE_ANY_EXHDR(pat_text); ++ pend = p + plen; + if (esc_text == NULL) + { + /* No ESCAPE clause provided; default to backslash as escape */ +@@ -778,7 +780,7 @@ similar_escape_internal(text *pat_text, text *esc_text) + + if (elen > 1) + { +- int mblen = pg_mblen(p); ++ int mblen = pg_mblen_range(p, pend); + + if (mblen > 1) + { +diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c +index dc373cee860..c5e16f5f96b 100644 +--- a/src/backend/utils/adt/tsquery.c ++++ b/src/backend/utils/adt/tsquery.c +@@ -110,7 +110,7 @@ get_modifiers(char *buf, int16 *weight, bool *prefix) + return buf; + + buf++; +- while (*buf && pg_mblen(buf) == 1) ++ while (*buf && pg_mblen_cstr(buf) == 1) + { + switch (*buf) + { +@@ -187,7 +187,7 @@ parse_phrase_operator(TSQueryParserState pstate, int16 *distance) + continue; + } + +- if (!t_isdigit(ptr)) ++ if (!t_isdigit_cstr(ptr)) + return false; + + errno = 0; +@@ -252,12 +252,12 @@ 
parse_or_operator(TSQueryParserState pstate) + return false; + + /* it shouldn't be a part of any word */ +- if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha(ptr) || t_isdigit(ptr)) ++ if (t_iseq(ptr, '-') || t_iseq(ptr, '_') || t_isalpha_cstr(ptr) || t_isdigit_cstr(ptr)) + return false; + + for (;;) + { +- ptr += pg_mblen(ptr); ++ ptr += pg_mblen_cstr(ptr); + + if (*ptr == '\0') /* got end of string without operand */ + return false; +@@ -267,7 +267,7 @@ parse_or_operator(TSQueryParserState pstate) + * So we still treat OR literal as operation with possibly incorrect + * operand and will not search it as lexeme + */ +- if (!t_isspace(ptr)) ++ if (!t_isspace_cstr(ptr)) + break; + } + +@@ -310,7 +310,7 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, + errmsg("syntax error in tsquery: \"%s\"", + state->buffer))); + } +- else if (!t_isspace(state->buf)) ++ else if (!t_isspace_cstr(state->buf)) + { + /* + * We rely on the tsvector parser to parse the value for +@@ -368,14 +368,14 @@ gettoken_query_standard(TSQueryParserState state, int8 *operator, + { + return (state->count) ? 
PT_ERR : PT_END; + } +- else if (!t_isspace(state->buf)) ++ else if (!t_isspace_cstr(state->buf)) + { + return PT_ERR; + } + break; + } + +- state->buf += pg_mblen(state->buf); ++ state->buf += pg_mblen_cstr(state->buf); + } + } + +@@ -438,7 +438,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, + state->state = WAITOPERAND; + continue; + } +- else if (!t_isspace(state->buf)) ++ else if (!t_isspace_cstr(state->buf)) + { + /* + * We rely on the tsvector parser to parse the value for +@@ -497,13 +497,13 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, + /* ignore other operators in this state too */ + state->buf++; + continue; +- } ++ } /* backport starts here */ + else if (*state->buf == '\0') + { + return PT_END; + } +- else if (!t_isspace(state->buf)) +- { ++ else if (!t_isspace_cstr(state->buf)) ++ {/* backport end */ + if (state->in_quotes) + { + /* put implicit <-> after an operand */ +@@ -522,7 +522,7 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator, + break; + } + +- state->buf += pg_mblen(state->buf); ++ state->buf += pg_mblen_cstr(state->buf); + } + } + +@@ -1006,9 +1006,8 @@ infix(INFIX *in, int parentPriority, bool rightPhraseOp) + *(in->cur) = '\\'; + in->cur++; + } +- COPYCHAR(in->cur, op); + +- clen = pg_mblen(op); ++ clen = ts_copychar_cstr(in->cur, op); + op += clen; + in->cur += clen; + } +diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c +index 8972f419abc..8b8f4e160eb 100644 +--- a/src/backend/utils/adt/tsvector.c ++++ b/src/backend/utils/adt/tsvector.c +@@ -313,9 +313,9 @@ tsvectorout(PG_FUNCTION_ARGS) + lenbuf = 0, + pp; + WordEntry *ptr = ARRPTR(out); +- char *curbegin, +- *curin, ++ char *curin, + *curout; ++ const char *curend; + + lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; + for (i = 0; i < out->size; i++) +@@ -328,13 +328,14 @@ tsvectorout(PG_FUNCTION_ARGS) + curout = outbuf = (char *) palloc(lenbuf); + for (i = 0; i 
< out->size; i++) + { +- curbegin = curin = STRPTR(out) + ptr->pos; ++ curin = STRPTR(out) + ptr->pos; ++ curend = curin + ptr->len; + if (i != 0) + *curout++ = ' '; + *curout++ = '\''; +- while (curin - curbegin < ptr->len) ++ while (curin < curend) + { +- int len = pg_mblen(curin); ++ int len = pg_mblen_range(curin, curend); + + if (t_iseq(curin, '\'')) + *curout++ = '\''; +diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c +index cc3e3c15054..b559916f84c 100644 +--- a/src/backend/utils/adt/tsvector_op.c ++++ b/src/backend/utils/adt/tsvector_op.c +@@ -2434,11 +2434,15 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) + if (ws) + { + char *buf; ++ const char *end; + + buf = VARDATA_ANY(ws); +- while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) ++ end = buf + VARSIZE_ANY_EXHDR(ws); ++ while (buf < end) + { +- if (pg_mblen(buf) == 1) ++ int len = pg_mblen_range(buf, end); ++ ++ if (len == 1) + { + switch (*buf) + { +@@ -2462,7 +2466,7 @@ ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) + stat->weight |= 0; + } + } +- buf += pg_mblen(buf); ++ buf += len; + } + } + +diff --git a/src/backend/utils/adt/tsvector_parser.c b/src/backend/utils/adt/tsvector_parser.c +index cfc181c20df..3567f0f9f43 100644 +--- a/src/backend/utils/adt/tsvector_parser.c ++++ b/src/backend/utils/adt/tsvector_parser.c +@@ -185,10 +185,9 @@ gettoken_tsvector(TSVectorParseState state, + else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) + PRSSYNTAXERROR; +- else if (!t_isspace(state->prsbuf)) ++ else if (!t_isspace_cstr(state->prsbuf)) + { +- COPYCHAR(curpos, state->prsbuf); +- curpos += pg_mblen(state->prsbuf); ++ curpos += ts_copychar_cstr(curpos, state->prsbuf); + statecode = WAITENDWORD; + } + } +@@ -202,8 +201,7 @@ gettoken_tsvector(TSVectorParseState state, + else + { + RESIZEPRSBUF; +- COPYCHAR(curpos, state->prsbuf); +- curpos += pg_mblen(state->prsbuf); ++ 
curpos += ts_copychar_cstr(curpos, state->prsbuf); + Assert(oldstate != 0); + statecode = oldstate; + } +@@ -215,7 +213,7 @@ gettoken_tsvector(TSVectorParseState state, + statecode = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } +- else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || ++ else if (t_isspace_cstr(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf)) || + (state->is_web && t_iseq(state->prsbuf, '"'))) + { +@@ -238,8 +236,7 @@ gettoken_tsvector(TSVectorParseState state, + else + { + RESIZEPRSBUF; +- COPYCHAR(curpos, state->prsbuf); +- curpos += pg_mblen(state->prsbuf); ++ curpos += ts_copychar_cstr(curpos, state->prsbuf); + } + } + else if (statecode == WAITENDCMPLX) +@@ -258,8 +255,7 @@ gettoken_tsvector(TSVectorParseState state, + else + { + RESIZEPRSBUF; +- COPYCHAR(curpos, state->prsbuf); +- curpos += pg_mblen(state->prsbuf); ++ curpos += ts_copychar_cstr(curpos, state->prsbuf); + } + } + else if (statecode == WAITCHARCMPLX) +@@ -267,8 +263,7 @@ gettoken_tsvector(TSVectorParseState state, + if (!state->is_web && t_iseq(state->prsbuf, '\'')) + { + RESIZEPRSBUF; +- COPYCHAR(curpos, state->prsbuf); +- curpos += pg_mblen(state->prsbuf); ++ curpos += ts_copychar_cstr(curpos, state->prsbuf); + statecode = WAITENDCMPLX; + } + else +@@ -279,7 +274,7 @@ gettoken_tsvector(TSVectorParseState state, + PRSSYNTAXERROR; + if (state->oprisdelim) + { +- /* state->prsbuf+=pg_mblen(state->prsbuf); */ ++ /* state->prsbuf+=pg_mblen_cstr(state->prsbuf); */ + RETURN_TOKEN; + } + else +@@ -296,7 +291,7 @@ gettoken_tsvector(TSVectorParseState state, + } + else if (statecode == INPOSINFO) + { +- if (t_isdigit(state->prsbuf)) ++ if (t_isdigit_cstr(state->prsbuf)) + { + if (posalen == 0) + { +@@ -351,10 +346,10 @@ gettoken_tsvector(TSVectorParseState state, + PRSSYNTAXERROR; + WEP_SETWEIGHT(pos[npos - 1], 0); + } +- else if (t_isspace(state->prsbuf) || ++ else if (t_isspace_cstr(state->prsbuf) || + *(state->prsbuf) == '\0') 
+ RETURN_TOKEN; +- else if (!t_isdigit(state->prsbuf)) ++ else if (!t_isdigit_cstr(state->prsbuf)) + PRSSYNTAXERROR; + } + else /* internal error */ +@@ -362,6 +357,6 @@ gettoken_tsvector(TSVectorParseState state, + statecode); + + /* get next char */ +- state->prsbuf += pg_mblen(state->prsbuf); ++ state->prsbuf += pg_mblen_cstr(state->prsbuf); + } + } +diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c +index de3852045b1..78e4615501b 100644 +--- a/src/backend/utils/adt/varbit.c ++++ b/src/backend/utils/adt/varbit.c +@@ -230,8 +230,8 @@ bit_in(PG_FUNCTION_ARGS) + else if (*sp != '0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), +- errmsg("\"%c\" is not a valid binary digit", +- *sp))); ++ errmsg("\"%.*s\" is not a valid binary digit", ++ pg_mblen_cstr(sp), sp))); + + x >>= 1; + if (x == 0) +@@ -255,8 +255,8 @@ bit_in(PG_FUNCTION_ARGS) + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), +- errmsg("\"%c\" is not a valid hexadecimal digit", +- *sp))); ++ errmsg("\"%.*s\" is not a valid hexadecimal digit", ++ pg_mblen_cstr(sp), sp))); + + if (bc) + { +@@ -531,8 +531,8 @@ varbit_in(PG_FUNCTION_ARGS) + else if (*sp != '0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), +- errmsg("\"%c\" is not a valid binary digit", +- *sp))); ++ errmsg("\"%.*s\" is not a valid binary digit", ++ pg_mblen_cstr(sp), sp))); + + x >>= 1; + if (x == 0) +@@ -556,8 +556,8 @@ varbit_in(PG_FUNCTION_ARGS) + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), +- errmsg("\"%c\" is not a valid hexadecimal digit", +- *sp))); ++ errmsg("\"%.*s\" is not a valid hexadecimal digit", ++ pg_mblen_cstr(sp), sp))); + + if (bc) + { +diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c +index 9dea2a5494d..437d84c9241 100644 +--- a/src/backend/utils/adt/varlena.c ++++ b/src/backend/utils/adt/varlena.c +@@ -122,6 +122,7 @@ static text *text_substring(Datum str, + int32 start, + int32 
length, + bool length_not_specified); ++static int pg_mbcharcliplen_chars(const char *mbstr, int len, int limit); + static text *text_overlay(text *t1, text *t2, int sp, int sl); + static int text_position(text *t1, text *t2, Oid collid); + static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state); +@@ -762,8 +763,11 @@ text_catenate(text *t1, text *t2) + * charlen_to_bytelen() + * Compute the number of bytes occupied by n characters starting at *p + * +- * It is caller's responsibility that there actually are n characters; +- * the string need not be null-terminated. ++ * The caller shall ensure there are n complete characters. Callers achieve ++ * this by deriving "n" from regmatch_t findings from searching a wchar array. ++ * pg_mb2wchar_with_len() skips any trailing incomplete character, so regex ++ * matches will end no later than the last complete character. (The string ++ * need not be null-terminated.) + */ + static int + charlen_to_bytelen(const char *p, int n) +@@ -778,7 +782,7 @@ charlen_to_bytelen(const char *p, int n) + const char *s; + + for (s = p; n > 0; n--) +- s += pg_mblen(s); ++ s += pg_mblen_unbounded(s); /* caller verified encoding */ + + return s - p; + } +@@ -851,7 +855,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + int32 S = start; /* start position */ + int32 S1; /* adjusted start position */ + int32 L1; /* adjusted substring length */ +- int32 E; /* end position */ ++ int32 E; /* end position, exclusive */ + + /* + * SQL99 says S can be zero or negative, but we still must fetch from the +@@ -911,6 +915,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + int32 slice_start; + int32 slice_size; + int32 slice_strlen; ++ int32 slice_len; + text *slice; + int32 E1; + int32 i; +@@ -947,11 +952,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + else + { + /* +- * A zero or negative value for the end 
position can happen if the +- * start was negative or one. SQL99 says to return a zero-length +- * string. ++ * Ending at position 1, exclusive, obviously yields an empty ++ * string. A zero or negative value can happen if the start was ++ * negative or one. SQL99 says to return a zero-length string. + */ +- if (E < 1) ++ if (E <= 1) + return cstring_to_text(""); + + /* +@@ -961,11 +966,11 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + L1 = E - S1; + + /* +- * Total slice size in bytes can't be any longer than the start +- * position plus substring length times the encoding max length. +- * If that overflows, we can just use -1. ++ * Total slice size in bytes can't be any longer than the ++ * inclusive end position times the encoding max length. If that ++ * overflows, we can just use -1. + */ +- if (pg_mul_s32_overflow(E, eml, &slice_size)) ++ if (pg_mul_s32_overflow(E - 1, eml, &slice_size)) + slice_size = -1; + } + +@@ -980,16 +985,25 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + slice = (text *) DatumGetPointer(str); + + /* see if we got back an empty string */ +- if (VARSIZE_ANY_EXHDR(slice) == 0) ++ slice_len = VARSIZE_ANY_EXHDR(slice); ++ if (slice_len == 0) + { + if (slice != (text *) DatumGetPointer(str)) + pfree(slice); + return cstring_to_text(""); + } + +- /* Now we can get the actual length of the slice in MB characters */ +- slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), +- VARSIZE_ANY_EXHDR(slice)); ++ /* ++ * Now we can get the actual length of the slice in MB characters, ++ * stopping at the end of the substring. Continuing beyond the ++ * substring end could find an incomplete character attributable ++ * solely to DatumGetTextPSlice() chopping in the middle of a ++ * character, and it would be superfluous work at best. ++ */ ++ slice_strlen = ++ (slice_size == -1 ? 
++ pg_mbstrlen_with_len(VARDATA_ANY(slice), slice_len) : ++ pg_mbcharcliplen_chars(VARDATA_ANY(slice), slice_len, E - 1)); + + /* + * Check that the start position wasn't > slice_strlen. If so, SQL99 +@@ -1016,7 +1030,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + */ + p = VARDATA_ANY(slice); + for (i = 0; i < S1 - 1; i++) +- p += pg_mblen(p); ++ p += pg_mblen_unbounded(p); + + /* hang onto a pointer to our start position */ + s = p; +@@ -1026,7 +1040,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + * length. + */ + for (i = S1; i < E1; i++) +- p += pg_mblen(p); ++ p += pg_mblen_unbounded(p); + + ret = (text *) palloc(VARHDRSZ + (p - s)); + SET_VARSIZE(ret, VARHDRSZ + (p - s)); +@@ -1044,6 +1058,35 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) + return NULL; + } + ++/* ++ * pg_mbcharcliplen_chars - ++ * Mirror pg_mbcharcliplen(), except return value unit is chars, not bytes. ++ * ++ * This mirrors all the dubious historical behavior, so it's static to ++ * discourage proliferation. The assertions are specific to the one caller. ++ */ ++static int ++pg_mbcharcliplen_chars(const char *mbstr, int len, int limit) ++{ ++ int nch = 0; ++ int l; ++ ++ Assert(len > 0); ++ Assert(limit > 0); ++ Assert(pg_database_encoding_max_length() > 1); ++ ++ while (len > 0 && *mbstr) ++ { ++ l = pg_mblen_with_len(mbstr, len); ++ nch++; ++ if (nch == limit) ++ break; ++ len -= l; ++ mbstr += l; ++ } ++ return nch; ++} ++ + /* + * textoverlay + * Replace specified substring of first string with second +@@ -1333,6 +1376,8 @@ retry: + */ + if (state->is_multibyte_char_in_char) + { ++ const char *haystack_end = state->str1 + state->len1; ++ + /* Walk one character at a time, until we reach the match. */ + + /* the search should never move backwards. */ +@@ -1341,7 +1386,7 @@ retry: + while (state->refpoint < matchptr) + { + /* step to next character. 
*/ +- state->refpoint += pg_mblen(state->refpoint); ++ state->refpoint += pg_mblen_range(state->refpoint, haystack_end); + state->refpos++; + + /* +@@ -1457,7 +1502,8 @@ text_position_get_match_pos(TextPositionState *state) + /* Convert the byte position to char position. */ + while (state->refpoint < state->last_match) + { +- state->refpoint += pg_mblen(state->refpoint); ++ state->refpoint += pg_mblen_range(state->refpoint, ++ state->last_match); + state->refpos++; + } + Assert(state->refpoint == state->last_match); +@@ -4326,7 +4372,7 @@ check_replace_text_has_escape_char(const text *replace_text) + } + else + { +- for (; p < p_end; p += pg_mblen(p)) ++ for (; p < p_end; p += pg_mblen_range(p, p_end)) + { + if (*p == '\\') + return true; +@@ -4366,7 +4412,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, + } + else + { +- for (; p < p_end && *p != '\\'; p += pg_mblen(p)) ++ for (; p < p_end && *p != '\\'; p += pg_mblen_range(p, p_end)) + /* nothing */ ; + } + +@@ -4834,6 +4880,8 @@ text_to_array_internal(PG_FUNCTION_ARGS) + } + else + { ++ const char *end_ptr; ++ + /* + * When fldsep is NULL, each character in the inputstring becomes an + * element in the result array. 
The separator is effectively the +@@ -4846,10 +4894,11 @@ text_to_array_internal(PG_FUNCTION_ARGS) + PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); + + start_ptr = VARDATA_ANY(inputstring); ++ end_ptr = start_ptr + inputstring_len; + + while (inputstring_len > 0) + { +- int chunk_len = pg_mblen(start_ptr); ++ int chunk_len = pg_mblen_range(start_ptr, end_ptr); + + CHECK_FOR_INTERRUPTS(); + +@@ -5440,7 +5489,7 @@ text_reverse(PG_FUNCTION_ARGS) + { + int sz; + +- sz = pg_mblen(p); ++ sz = pg_mblen_range(p, endp); + dst -= sz; + memcpy(dst, p, sz); + p += sz; +@@ -5600,8 +5649,8 @@ text_format(PG_FUNCTION_ARGS) + if (strchr("sIL", *cp) == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +- errmsg("unrecognized format() type specifier \"%c\"", +- *cp), ++ errmsg("unrecognized format() type specifier \"%.*s\"", ++ pg_mblen_range(cp, end_ptr), cp), + errhint("For a single \"%%\" use \"%%%%\"."))); + + /* If indirect width was specified, get its value */ +@@ -5721,8 +5770,8 @@ text_format(PG_FUNCTION_ARGS) + /* should not get here, because of previous check */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), +- errmsg("unrecognized format() type specifier \"%c\"", +- *cp), ++ errmsg("unrecognized format() type specifier \"%.*s\"", ++ pg_mblen_range(cp, end_ptr), cp), + errhint("For a single \"%%\" use \"%%%%\"."))); + break; + } +diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c +index 4cb1d5d9d6c..e0d8351652a 100644 +--- a/src/backend/utils/adt/xml.c ++++ b/src/backend/utils/adt/xml.c +@@ -2037,8 +2037,7 @@ sqlchar_to_unicode(const char *s) + char *utf8string; + pg_wchar ret[2]; /* need space for trailing zero */ + +- /* note we're not assuming s is null-terminated */ +- utf8string = pg_server_to_any(s, pg_mblen(s), PG_UTF8); ++ utf8string = pg_server_to_any(s, pg_mblen_cstr(s), PG_UTF8); + + pg_encoding_mb2wchar_with_len(PG_UTF8, utf8string, ret, + pg_encoding_mblen(PG_UTF8, utf8string)); +@@ -2091,7 +2090,7 
@@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, + + initStringInfo(&buf); + +- for (p = ident; *p; p += pg_mblen(p)) ++ for (p = ident; *p; p += pg_mblen_cstr(p)) + { + if (*p == ':' && (p == ident || fully_escaped)) + appendStringInfoString(&buf, "_x003A_"); +@@ -2116,7 +2115,7 @@ map_sql_identifier_to_xml_name(const char *ident, bool fully_escaped, + : !is_valid_xml_namechar(u)) + appendStringInfo(&buf, "_x%04X_", (unsigned int) u); + else +- appendBinaryStringInfo(&buf, p, pg_mblen(p)); ++ appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); + } + } + +@@ -2139,7 +2138,7 @@ map_xml_name_to_sql_identifier(const char *name) + + initStringInfo(&buf); + +- for (p = name; *p; p += pg_mblen(p)) ++ for (p = name; *p; p += pg_mblen_cstr(p)) + { + if (*p == '_' && *(p + 1) == 'x' + && isxdigit((unsigned char) *(p + 2)) +@@ -2157,7 +2156,7 @@ map_xml_name_to_sql_identifier(const char *name) + p += 6; + } + else +- appendBinaryStringInfo(&buf, p, pg_mblen(p)); ++ appendBinaryStringInfo(&buf, p, pg_mblen_cstr(p)); + } + + return buf.data; +diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c +index 8856ae1508f..cec0b83fcfe 100644 +--- a/src/backend/utils/cache/syscache.c ++++ b/src/backend/utils/cache/syscache.c +@@ -40,6 +40,7 @@ + #include "catalog/pg_description.h" + #include "catalog/pg_enum.h" + #include "catalog/pg_event_trigger.h" ++#include "catalog/pg_extension.h" + #include "catalog/pg_foreign_data_wrapper.h" + #include "catalog/pg_foreign_server.h" + #include "catalog/pg_foreign_table.h" +@@ -983,6 +984,18 @@ static const struct cachedesc cacheinfo[] = { + 0 + }, + 2 ++ }, ++ /* intentionally out of alphabetical order, to avoid an ABI break: */ ++ {ExtensionRelationId, /* EXTENSIONOID */ ++ ExtensionOidIndexId, ++ 1, ++ { ++ Anum_pg_extension_oid, ++ 0, ++ 0, ++ 0 ++ }, ++ 2 + } + }; + +diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c +index 831f38b6758..82b39470662 100644 +--- 
a/src/backend/utils/mb/mbutils.c ++++ b/src/backend/utils/mb/mbutils.c +@@ -38,6 +38,7 @@ + #include "catalog/namespace.h" + #include "mb/pg_wchar.h" + #include "utils/builtins.h" ++#include "utils/memdebug.h" + #include "utils/memutils.h" + #include "utils/syscache.h" + +@@ -96,6 +97,13 @@ static char *perform_default_encoding_conversion(const char *src, + int len, bool is_client_to_server); + static int cliplen(const char *str, int len, int limit); + ++pg_attribute_noreturn() ++static void report_invalid_encoding_int(int encoding, const char *mbstr, ++ int mblen, int len); ++ ++pg_attribute_noreturn() ++static void report_invalid_encoding_db(const char *mbstr, int mblen, int len); ++ + + /* + * Prepare for a future call to SetClientEncoding. Success should mean +@@ -902,11 +910,128 @@ pg_encoding_wchar2mb_with_len(int encoding, + return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len); + } + +-/* returns the byte length of a multibyte character */ ++/* ++ * Returns the byte length of a multibyte character sequence in a ++ * null-terminated string. Raises an illegal byte sequence error if the ++ * sequence would hit a null terminator. ++ * ++ * The caller is expected to have checked for a terminator at *mbstr == 0 ++ * before calling, but some callers want 1 in that case, so this function ++ * continues that tradition. ++ * ++ * This must only be used for strings that have a null-terminator to enable ++ * bounds detection. ++ */ ++int ++pg_mblen_cstr(const char *mbstr) ++{ ++ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); ++ ++ /* ++ * The .mblen functions return 1 when given a pointer to a terminator. ++ * Some callers depend on that, so we tolerate it for now. Well-behaved ++ * callers check the leading byte for a terminator *before* calling. 
++ */ ++ for (int i = 1; i < length; ++i) ++ if (unlikely(mbstr[i] == 0)) ++ report_invalid_encoding_db(mbstr, length, i); ++ ++ /* ++ * String should be NUL-terminated, but checking that would make typical ++ * callers O(N^2), tripling Valgrind check-world time. Unless ++ * VALGRIND_EXPENSIVE, check 1 byte after each actual character. (If we ++ * found a character, not a terminator, the next byte must be a terminator ++ * or the start of the next character.) If the caller iterates the whole ++ * string, the last call will diagnose a missing terminator. ++ */ ++ if (mbstr[0] != '\0') ++ { ++#ifdef VALGRIND_EXPENSIVE ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, strlen(mbstr)); ++#else ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr + length, 1); ++#endif ++ } ++ ++ return length; ++} ++ ++/* ++ * Returns the byte length of a multibyte character sequence bounded by a range ++ * [mbstr, end) of at least one byte in size. Raises an illegal byte sequence ++ * error if the sequence would exceed the range. ++ */ ++int ++pg_mblen_range(const char *mbstr, const char *end) ++{ ++ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); ++ ++ Assert(end > mbstr); ++ ++ if (unlikely(mbstr + length > end)) ++ report_invalid_encoding_db(mbstr, length, end - mbstr); ++ ++#ifdef VALGRIND_EXPENSIVE ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, end - mbstr); ++#else ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); ++#endif ++ ++ return length; ++} ++ ++/* ++ * Returns the byte length of a multibyte character sequence bounded by a range ++ * extending for 'limit' bytes, which must be at least one. Raises an illegal ++ * byte sequence error if the sequence would exceed the range. 
++ */ ++int ++pg_mblen_with_len(const char *mbstr, int limit) ++{ ++ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); ++ ++ Assert(limit >= 1); ++ ++ if (unlikely(length > limit)) ++ report_invalid_encoding_db(mbstr, length, limit); ++ ++#ifdef VALGRIND_EXPENSIVE ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, limit); ++#else ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); ++#endif ++ ++ return length; ++} ++ ++ ++/* ++ * Returns the length of a multibyte character sequence, without any ++ * validation of bounds. ++ * ++ * PLEASE NOTE: This function can only be used safely if the caller has ++ * already verified the input string, since otherwise there is a risk of ++ * overrunning the buffer if the string is invalid. A prior call to a ++ * pg_mbstrlen* function suffices. ++ */ ++int ++pg_mblen_unbounded(const char *mbstr) ++{ ++ int length = pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); ++ ++ VALGRIND_CHECK_MEM_IS_DEFINED(mbstr, length); ++ ++ return length; ++} ++ ++/* ++ * Historical name for pg_mblen_unbounded(). Should not be used and will be ++ * removed in a later version. 
++ */ + int + pg_mblen(const char *mbstr) + { +- return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr); ++ return pg_mblen_unbounded(mbstr); + } + + /* returns the display length of a multibyte character */ +@@ -928,14 +1053,14 @@ pg_mbstrlen(const char *mbstr) + + while (*mbstr) + { +- mbstr += pg_mblen(mbstr); ++ mbstr += pg_mblen_cstr(mbstr); + len++; + } + return len; + } + + /* returns the length (counted in wchars) of a multibyte string +- * (not necessarily NULL terminated) ++ * (stops at the first of "limit" or a NUL) + */ + int + pg_mbstrlen_with_len(const char *mbstr, int limit) +@@ -948,7 +1073,7 @@ pg_mbstrlen_with_len(const char *mbstr, int limit) + + while (limit > 0 && *mbstr) + { +- int l = pg_mblen(mbstr); ++ int l = pg_mblen_with_len(mbstr, limit); + + limit -= l; + mbstr += l; +@@ -1018,7 +1143,7 @@ pg_mbcharcliplen(const char *mbstr, int len, int limit) + + while (len > 0 && *mbstr) + { +- l = pg_mblen(mbstr); ++ l = pg_mblen_with_len(mbstr, len); + nch++; + if (nch > limit) + break; +@@ -1575,12 +1700,19 @@ void + report_invalid_encoding(int encoding, const char *mbstr, int len) + { + int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len); ++ ++ report_invalid_encoding_int(encoding, mbstr, l, len); ++} ++ ++static void ++report_invalid_encoding_int(int encoding, const char *mbstr, int mblen, int len) ++{ + char buf[8 * 5 + 1]; + char *p = buf; + int j, + jlimit; + +- jlimit = Min(l, len); ++ jlimit = Min(mblen, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) +@@ -1597,6 +1729,12 @@ report_invalid_encoding(int encoding, const char *mbstr, int len) + buf))); + } + ++static void ++report_invalid_encoding_db(const char *mbstr, int mblen, int len) ++{ ++ report_invalid_encoding_int(GetDatabaseEncoding(), mbstr, mblen, len); ++} ++ + /* + * report_untranslatable_char: complain about untranslatable character + * +diff --git a/src/common/wchar.c b/src/common/wchar.c 
+index 78c60eeef97..e0c3a7c89d5 100644 +--- a/src/common/wchar.c ++++ b/src/common/wchar.c +@@ -266,12 +266,22 @@ pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) + return cnt; + } + ++/* ++ * mbverifychar does not accept SS2 or SS3 (CS2 and CS3 are not defined for ++ * EUC_CN), but mb2wchar_with_len does. Tell a coherent story for code that ++ * relies on agreement between mb2wchar_with_len and mblen. Invalid text ++ * datums (e.g. from shared catalogs) reach this. ++ */ + static int + pg_euccn_mblen(const unsigned char *s) + { + int len; + +- if (IS_HIGHBIT_SET(*s)) ++ if (*s == SS2) ++ len = 3; ++ else if (*s == SS3) ++ len = 3; ++ else if (IS_HIGHBIT_SET(*s)) + len = 2; + else + len = 1; +@@ -1554,7 +1564,7 @@ pg_encoding_set_invalid(int encoding, char *dst) + const pg_wchar_tbl pg_wchar_table[] = { + {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ +- {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ ++ {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3}, /* PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ + {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ +diff --git a/src/include/access/slru.h b/src/include/access/slru.h +index 19982f6e226..c1e481b3647 100644 +--- a/src/include/access/slru.h ++++ b/src/include/access/slru.h +@@ -92,7 +92,9 @@ typedef struct SlruSharedData + /* + * latest_page_number is the page number of the current 
end of the log; + * this is not critical data, since we use it only to avoid swapping out +- * the latest page. ++ * the latest page. (An exception: an accurate latest_page_number is ++ * needed on pg_multixact/offsets to replay WAL generated with older minor ++ * versions correctly. See RecordNewMultiXact().) + */ + int latest_page_number; + +diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h +index be235ed6d04..6c042320de1 100644 +--- a/src/include/catalog/dependency.h ++++ b/src/include/catalog/dependency.h +@@ -226,6 +226,8 @@ extern long changeDependenciesOn(Oid refClassId, Oid oldRefObjectId, + extern Oid getExtensionOfObject(Oid classId, Oid objectId); + extern List *getAutoExtensionsOfObject(Oid classId, Oid objectId); + ++extern Oid getExtensionType(Oid extensionOid, const char *typname); ++ + extern bool sequenceIsOwned(Oid seqId, char deptype, Oid *tableId, int32 *colId); + extern List *getOwnedSequences(Oid relid); + extern Oid getIdentitySequence(Oid relid, AttrNumber attnum, bool missing_ok); +diff --git a/src/include/commands/extension.h b/src/include/commands/extension.h +index 8b06df02a72..09ad68450a7 100644 +--- a/src/include/commands/extension.h ++++ b/src/include/commands/extension.h +@@ -49,6 +49,8 @@ extern Oid get_extension_oid(const char *extname, bool missing_ok); + extern char *get_extension_name(Oid ext_oid); + extern bool extension_file_exists(const char *extensionName); + ++extern Oid get_function_sibling_type(Oid funcoid, const char *typname); ++ + extern ObjectAddress AlterExtensionNamespace(const char *extensionName, const char *newschema, + Oid *oldschema); + +diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h +index c970e51b41b..d3b38814825 100644 +--- a/src/include/mb/pg_wchar.h ++++ b/src/include/mb/pg_wchar.h +@@ -588,7 +588,14 @@ extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); + extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); + 
extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); + extern size_t pg_wchar_strlen(const pg_wchar *wstr); ++extern int pg_mblen_cstr(const char *mbstr); ++extern int pg_mblen_range(const char *mbstr, const char *end); ++extern int pg_mblen_with_len(const char *mbstr, int limit); ++extern int pg_mblen_unbounded(const char *mbstr); ++ ++/* deprecated */ + extern int pg_mblen(const char *mbstr); ++ + extern int pg_dsplen(const char *mbstr); + extern int pg_mbstrlen(const char *mbstr); + extern int pg_mbstrlen_with_len(const char *mbstr, int len); +diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h +index cc4bd9ab20d..b54345659e7 100644 +--- a/src/include/tsearch/ts_locale.h ++++ b/src/include/tsearch/ts_locale.h +@@ -42,12 +42,36 @@ typedef struct + /* The second argument of t_iseq() must be a plain ASCII character */ + #define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) + +-#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s)) ++/* Copy multibyte character of known byte length, return byte length. */ ++static inline int ++ts_copychar_with_len(void *dest, const void *src, int length) ++{ ++ memcpy(dest, src, length); ++ return length; ++} ++ ++/* Copy multibyte character from null-terminated string, return byte length. */ ++static inline int ++ts_copychar_cstr(void *dest, const void *src) ++{ ++ return ts_copychar_with_len(dest, src, pg_mblen_cstr((const char *) src)); ++} ++ ++/* Historical macro for the above. 
*/ ++#define COPYCHAR ts_copychar_cstr ++ ++#define GENERATE_T_ISCLASS_DECL(character_class) \ ++extern int t_is##character_class##_with_len(const char *ptr, int len); \ ++extern int t_is##character_class##_cstr(const char *ptr); \ ++extern int t_is##character_class##_unbounded(const char *ptr); \ ++\ ++/* deprecated */ \ ++extern int t_is##character_class(const char *ptr); + +-extern int t_isdigit(const char *ptr); +-extern int t_isspace(const char *ptr); +-extern int t_isalpha(const char *ptr); +-extern int t_isprint(const char *ptr); ++GENERATE_T_ISCLASS_DECL(alpha); ++GENERATE_T_ISCLASS_DECL(digit); ++GENERATE_T_ISCLASS_DECL(print); ++GENERATE_T_ISCLASS_DECL(space); + + extern char *lowerstr(const char *str); + extern char *lowerstr_with_len(const char *str, int len); +diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h +index a9b6f596585..0fad51b7e06 100644 +--- a/src/include/tsearch/ts_utils.h ++++ b/src/include/tsearch/ts_utils.h +@@ -38,14 +38,12 @@ extern bool gettoken_tsvector(TSVectorParseState state, + extern void close_tsvector_parser(TSVectorParseState state); + + /* phrase operator begins with '<' */ +-#define ISOPERATOR(x) \ +- ( pg_mblen(x) == 1 && ( *(x) == '!' || \ +- *(x) == '&' || \ +- *(x) == '|' || \ +- *(x) == '(' || \ +- *(x) == ')' || \ +- *(x) == '<' \ +- ) ) ++#define ISOPERATOR(x) (*(x) == '!' 
|| \ ++ *(x) == '&' || \ ++ *(x) == '|' || \ ++ *(x) == '(' || \ ++ *(x) == ')' || \ ++ *(x) == '<') + + /* parse_tsquery */ + +diff --git a/src/include/utils/array.h b/src/include/utils/array.h +index 3a3fc0f910d..78952a34e46 100644 +--- a/src/include/utils/array.h ++++ b/src/include/utils/array.h +@@ -389,6 +389,7 @@ extern void array_bitmap_copy(bits8 *destbitmap, int destoffset, + extern ArrayType *construct_array(Datum *elems, int nelems, + Oid elmtype, + int elmlen, bool elmbyval, char elmalign); ++extern ArrayType *construct_array_builtin(Datum *elems, int nelems, Oid elmtype); + extern ArrayType *construct_md_array(Datum *elems, + bool *nulls, + int ndims, +@@ -403,6 +404,9 @@ extern void deconstruct_array(ArrayType *array, + Oid elmtype, + int elmlen, bool elmbyval, char elmalign, + Datum **elemsp, bool **nullsp, int *nelemsp); ++extern void deconstruct_array_builtin(ArrayType *array, ++ Oid elmtype, ++ Datum **elemsp, bool **nullsp, int *nelemsp); + extern bool array_contains_nulls(ArrayType *array); + + extern ArrayBuildState *initArrayResult(Oid element_type, +diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h +index 067ac0928a2..1433cbe8754 100644 +--- a/src/include/utils/syscache.h ++++ b/src/include/utils/syscache.h +@@ -108,9 +108,11 @@ enum SysCacheIdentifier + TYPENAMENSP, + TYPEOID, + USERMAPPINGOID, +- USERMAPPINGUSERSERVER ++ USERMAPPINGUSERSERVER, ++ /* intentionally out of alphabetical order, to avoid an ABI break: */ ++ EXTENSIONOID + +-#define SysCacheSize (USERMAPPINGUSERSERVER + 1) ++#define SysCacheSize (EXTENSIONOID + 1) + }; + + extern void InitCatalogCache(void); +diff --git a/src/test/modules/test_regex/test_regex.c b/src/test/modules/test_regex/test_regex.c +new file mode 100644 +index 00000000000..b8ff535c8f3 +--- /dev/null ++++ b/src/test/modules/test_regex/test_regex.c +@@ -0,0 +1,774 @@ ++/*-------------------------------------------------------------------------- ++ * ++ * test_regex.c ++ * Test harness 
for the regular expression package. ++ * ++ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group ++ * Portions Copyright (c) 1994, Regents of the University of California ++ * ++ * IDENTIFICATION ++ * src/test/modules/test_regex/test_regex.c ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#include "postgres.h" ++ ++#include "funcapi.h" ++#include "miscadmin.h" ++#include "regex/regex.h" ++#include "utils/array.h" ++#include "utils/builtins.h" ++ ++PG_MODULE_MAGIC; ++ ++ ++/* all the options of interest for regex functions */ ++typedef struct test_re_flags ++{ ++ int cflags; /* compile flags for Spencer's regex code */ ++ int eflags; /* execute flags for Spencer's regex code */ ++ long info; /* expected re_info bits */ ++ bool glob; /* do it globally (for each occurrence) */ ++ bool indices; /* report indices not actual strings */ ++ bool partial; /* expect partial match */ ++} test_re_flags; ++ ++/* cross-call state for test_regex() */ ++typedef struct test_regex_ctx ++{ ++ test_re_flags re_flags; /* flags */ ++ rm_detail_t details; /* "details" from execution */ ++ text *orig_str; /* data string in original TEXT form */ ++ int nmatches; /* number of places where pattern matched */ ++ int npatterns; /* number of capturing subpatterns */ ++ /* We store start char index and end+1 char index for each match */ ++ /* so the number of entries in match_locs is nmatches * npatterns * 2 */ ++ int *match_locs; /* 0-based character indexes */ ++ int next_match; /* 0-based index of next match to process */ ++ /* workspace for build_test_match_result() */ ++ Datum *elems; /* has npatterns+1 elements */ ++ bool *nulls; /* has npatterns+1 elements */ ++ pg_wchar *wide_str; /* wide-char version of original string */ ++ char *conv_buf; /* conversion buffer, if needed */ ++ int conv_bufsiz; /* size thereof */ ++} test_regex_ctx; ++ ++/* Local functions */ ++static void test_re_compile(text *text_re, int cflags, Oid 
collation, ++ regex_t *result_re); ++static void parse_test_flags(test_re_flags *flags, text *opts); ++static test_regex_ctx *setup_test_matches(text *orig_str, ++ regex_t *cpattern, ++ test_re_flags *flags, ++ Oid collation, ++ bool use_subpatterns); ++static ArrayType *build_test_info_result(regex_t *cpattern, ++ test_re_flags *flags); ++static ArrayType *build_test_match_result(test_regex_ctx *matchctx); ++ ++ ++/* ++ * test_regex(pattern text, string text, flags text) returns setof text[] ++ * ++ * This is largely based on regexp.c's regexp_matches, with additions ++ * for debugging purposes. ++ */ ++PG_FUNCTION_INFO_V1(test_regex); ++ ++Datum ++test_regex(PG_FUNCTION_ARGS) ++{ ++ FuncCallContext *funcctx; ++ test_regex_ctx *matchctx; ++ ArrayType *result_ary; ++ ++ if (SRF_IS_FIRSTCALL()) ++ { ++ text *pattern = PG_GETARG_TEXT_PP(0); ++ text *flags = PG_GETARG_TEXT_PP(2); ++ Oid collation = PG_GET_COLLATION(); ++ test_re_flags re_flags; ++ regex_t cpattern; ++ MemoryContext oldcontext; ++ ++ funcctx = SRF_FIRSTCALL_INIT(); ++ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); ++ ++ /* Determine options */ ++ parse_test_flags(&re_flags, flags); ++ ++ /* set up the compiled pattern */ ++ test_re_compile(pattern, re_flags.cflags, collation, &cpattern); ++ ++ /* be sure to copy the input string into the multi-call ctx */ ++ matchctx = setup_test_matches(PG_GETARG_TEXT_P_COPY(1), &cpattern, ++ &re_flags, ++ collation, ++ true); ++ ++ /* Pre-create workspace that build_test_match_result needs */ ++ matchctx->elems = (Datum *) palloc(sizeof(Datum) * ++ (matchctx->npatterns + 1)); ++ matchctx->nulls = (bool *) palloc(sizeof(bool) * ++ (matchctx->npatterns + 1)); ++ ++ MemoryContextSwitchTo(oldcontext); ++ funcctx->user_fctx = (void *) matchctx; ++ ++ /* ++ * Return the first result row, which is info equivalent to Tcl's ++ * "regexp -about" output ++ */ ++ result_ary = build_test_info_result(&cpattern, &re_flags); ++ ++ pg_regfree(&cpattern); ++ ++ 
SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); ++ } ++ else ++ { ++ /* Each subsequent row describes one match */ ++ funcctx = SRF_PERCALL_SETUP(); ++ matchctx = (test_regex_ctx *) funcctx->user_fctx; ++ ++ if (matchctx->next_match < matchctx->nmatches) ++ { ++ result_ary = build_test_match_result(matchctx); ++ matchctx->next_match++; ++ SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); ++ } ++ } ++ ++ SRF_RETURN_DONE(funcctx); ++} ++ ++ ++/* ++ * test_re_compile - compile a RE ++ * ++ * text_re --- the pattern, expressed as a TEXT object ++ * cflags --- compile options for the pattern ++ * collation --- collation to use for LC_CTYPE-dependent behavior ++ * result_re --- output, compiled RE is stored here ++ * ++ * Pattern is given in the database encoding. We internally convert to ++ * an array of pg_wchar, which is what Spencer's regex package wants. ++ * ++ * Caller must eventually pg_regfree the resulting RE to avoid memory leaks. ++ */ ++static void ++test_re_compile(text *text_re, int cflags, Oid collation, ++ regex_t *result_re) ++{ ++ int text_re_len = VARSIZE_ANY_EXHDR(text_re); ++ char *text_re_val = VARDATA_ANY(text_re); ++ pg_wchar *pattern; ++ int pattern_len; ++ int regcomp_result; ++ char errMsg[100]; ++ ++ /* Convert pattern string to wide characters */ ++ pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); ++ pattern_len = pg_mb2wchar_with_len(text_re_val, ++ pattern, ++ text_re_len); ++ ++ regcomp_result = pg_regcomp(result_re, ++ pattern, ++ pattern_len, ++ cflags, ++ collation); ++ ++ pfree(pattern); ++ ++ if (regcomp_result != REG_OKAY) ++ { ++ /* re didn't compile (no need for pg_regfree, if so) */ ++ ++ /* ++ * Here and in other places in this file, do CHECK_FOR_INTERRUPTS ++ * before reporting a regex error. This is so that if the regex ++ * library aborts and returns REG_CANCEL, we don't print an error ++ * message that implies the regex was invalid. 
++ */ ++ CHECK_FOR_INTERRUPTS(); ++ ++ pg_regerror(regcomp_result, result_re, errMsg, sizeof(errMsg)); ++ ereport(ERROR, ++ (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), ++ errmsg("invalid regular expression: %s", errMsg))); ++ } ++} ++ ++/* ++ * test_re_execute - execute a RE on pg_wchar data ++ * ++ * Returns true on match, false on no match ++ * Arguments are as for pg_regexec ++ */ ++static bool ++test_re_execute(regex_t *re, pg_wchar *data, int data_len, ++ int start_search, ++ rm_detail_t *details, ++ int nmatch, regmatch_t *pmatch, ++ int eflags) ++{ ++ int regexec_result; ++ char errMsg[100]; ++ ++ /* Initialize match locations in case engine doesn't */ ++ details->rm_extend.rm_so = -1; ++ details->rm_extend.rm_eo = -1; ++ for (int i = 0; i < nmatch; i++) ++ { ++ pmatch[i].rm_so = -1; ++ pmatch[i].rm_eo = -1; ++ } ++ ++ /* Perform RE match and return result */ ++ regexec_result = pg_regexec(re, ++ data, ++ data_len, ++ start_search, ++ details, ++ nmatch, ++ pmatch, ++ eflags); ++ ++ if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) ++ { ++ /* re failed??? 
*/ ++ CHECK_FOR_INTERRUPTS(); ++ pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); ++ ereport(ERROR, ++ (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), ++ errmsg("regular expression failed: %s", errMsg))); ++ } ++ ++ return (regexec_result == REG_OKAY); ++} ++ ++ ++/* ++ * parse_test_flags - parse the flags argument ++ * ++ * flags --- output argument, filled with desired options ++ * opts --- TEXT object, or NULL for defaults ++ */ ++static void ++parse_test_flags(test_re_flags *flags, text *opts) ++{ ++ /* these defaults must match Tcl's */ ++ int cflags = REG_ADVANCED; ++ int eflags = 0; ++ long info = 0; ++ ++ flags->glob = false; ++ flags->indices = false; ++ flags->partial = false; ++ ++ if (opts) ++ { ++ char *opt_p = VARDATA_ANY(opts); ++ int opt_len = VARSIZE_ANY_EXHDR(opts); ++ int i; ++ ++ for (i = 0; i < opt_len; i++) ++ { ++ switch (opt_p[i]) ++ { ++ case '-': ++ /* allowed, no-op */ ++ break; ++ case '!': ++ flags->partial = true; ++ break; ++ case '*': ++ /* test requires Unicode --- ignored here */ ++ break; ++ case '0': ++ flags->indices = true; ++ break; ++ ++ /* These flags correspond to user-exposed RE options: */ ++ case 'g': /* global match */ ++ flags->glob = true; ++ break; ++ case 'i': /* case insensitive */ ++ cflags |= REG_ICASE; ++ break; ++ case 'n': /* \n affects ^ $ . [^ */ ++ cflags |= REG_NEWLINE; ++ break; ++ case 'p': /* ~Perl, \n affects . [^ */ ++ cflags |= REG_NLSTOP; ++ cflags &= ~REG_NLANCH; ++ break; ++ case 'w': /* weird, \n affects ^ $ only */ ++ cflags &= ~REG_NLSTOP; ++ cflags |= REG_NLANCH; ++ break; ++ case 'x': /* expanded syntax */ ++ cflags |= REG_EXPANDED; ++ break; ++ ++ /* These flags correspond to Tcl's -xflags options: */ ++ case 'a': ++ cflags |= REG_ADVF; ++ break; ++ case 'b': ++ cflags &= ~REG_ADVANCED; ++ break; ++ case 'c': ++ ++ /* ++ * Tcl calls this TCL_REG_CANMATCH, but it's really ++ * REG_EXPECT. 
In this implementation we must also set ++ * the partial and indices flags, so that ++ * setup_test_matches and build_test_match_result will ++ * emit the desired data. (They'll emit more fields than ++ * Tcl would, but that's fine.) ++ */ ++ cflags |= REG_EXPECT; ++ flags->partial = true; ++ flags->indices = true; ++ break; ++ case 'e': ++ cflags &= ~REG_ADVANCED; ++ cflags |= REG_EXTENDED; ++ break; ++ case 'q': ++ cflags &= ~REG_ADVANCED; ++ cflags |= REG_QUOTE; ++ break; ++ case 'o': /* o for opaque */ ++ cflags |= REG_NOSUB; ++ break; ++ case 's': /* s for start */ ++ cflags |= REG_BOSONLY; ++ break; ++ case '+': ++ cflags |= REG_FAKE; ++ break; ++ case ',': ++ cflags |= REG_PROGRESS; ++ break; ++ case '.': ++ cflags |= REG_DUMP; ++ break; ++ case ':': ++ eflags |= REG_MTRACE; ++ break; ++ case ';': ++ eflags |= REG_FTRACE; ++ break; ++ case '^': ++ eflags |= REG_NOTBOL; ++ break; ++ case '$': ++ eflags |= REG_NOTEOL; ++ break; ++ case 't': ++ cflags |= REG_EXPECT; ++ break; ++ case '%': ++ eflags |= REG_SMALL; ++ break; ++ ++ /* These flags define expected info bits: */ ++ case 'A': ++ info |= REG_UBSALNUM; ++ break; ++ case 'B': ++ info |= REG_UBRACES; ++ break; ++ case 'E': ++ info |= REG_UBBS; ++ break; ++ case 'H': ++ info |= REG_ULOOKAROUND; ++ break; ++ case 'I': ++ info |= REG_UIMPOSSIBLE; ++ break; ++ case 'L': ++ info |= REG_ULOCALE; ++ break; ++ case 'M': ++ info |= REG_UUNPORT; ++ break; ++ case 'N': ++ info |= REG_UEMPTYMATCH; ++ break; ++ case 'P': ++ info |= REG_UNONPOSIX; ++ break; ++ case 'Q': ++ info |= REG_UBOUNDS; ++ break; ++ case 'R': ++ info |= REG_UBACKREF; ++ break; ++ case 'S': ++ info |= REG_UUNSPEC; ++ break; ++ case 'T': ++ info |= REG_USHORTEST; ++ break; ++ case 'U': ++ info |= REG_UPBOTCH; ++ break; ++ ++ default: ++ ereport(ERROR, ++ (errcode(ERRCODE_INVALID_PARAMETER_VALUE), ++ errmsg("invalid regular expression test option: \"%.*s\"", ++ pg_mblen_range(opt_p + i, opt_p + opt_len), ++ opt_p + i))); ++ break; ++ } ++ } ++ } ++ 
flags->cflags = cflags; ++ flags->eflags = eflags; ++ flags->info = info; ++} ++ ++/* ++ * setup_test_matches --- do the initial matching ++ * ++ * To simplify memory management, we do all the matching in one swoop. ++ * The returned test_regex_ctx contains the locations of all the substrings ++ * matching the pattern. ++ */ ++static test_regex_ctx * ++setup_test_matches(text *orig_str, ++ regex_t *cpattern, test_re_flags *re_flags, ++ Oid collation, ++ bool use_subpatterns) ++{ ++ test_regex_ctx *matchctx = palloc0(sizeof(test_regex_ctx)); ++ int eml = pg_database_encoding_max_length(); ++ int orig_len; ++ pg_wchar *wide_str; ++ int wide_len; ++ regmatch_t *pmatch; ++ int pmatch_len; ++ int array_len; ++ int array_idx; ++ int prev_match_end; ++ int start_search; ++ int maxlen = 0; /* largest fetch length in characters */ ++ ++ /* save flags */ ++ matchctx->re_flags = *re_flags; ++ ++ /* save original string --- we'll extract result substrings from it */ ++ matchctx->orig_str = orig_str; ++ ++ /* convert string to pg_wchar form for matching */ ++ orig_len = VARSIZE_ANY_EXHDR(orig_str); ++ wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); ++ wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); ++ ++ /* do we want to remember subpatterns? */ ++ if (use_subpatterns && cpattern->re_nsub > 0) ++ { ++ matchctx->npatterns = cpattern->re_nsub + 1; ++ pmatch_len = cpattern->re_nsub + 1; ++ } ++ else ++ { ++ use_subpatterns = false; ++ matchctx->npatterns = 1; ++ pmatch_len = 1; ++ } ++ ++ /* temporary output space for RE package */ ++ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); ++ ++ /* ++ * the real output space (grown dynamically if needed) ++ * ++ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather ++ * than at 2^27 ++ */ ++ array_len = re_flags->glob ? 
255 : 31; ++ matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); ++ array_idx = 0; ++ ++ /* search for the pattern, perhaps repeatedly */ ++ prev_match_end = 0; ++ start_search = 0; ++ while (test_re_execute(cpattern, wide_str, wide_len, ++ start_search, ++ &matchctx->details, ++ pmatch_len, pmatch, ++ re_flags->eflags)) ++ { ++ /* enlarge output space if needed */ ++ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) ++ { ++ array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ ++ if (array_len > MaxAllocSize / sizeof(int)) ++ ereport(ERROR, ++ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), ++ errmsg("too many regular expression matches"))); ++ matchctx->match_locs = (int *) repalloc(matchctx->match_locs, ++ sizeof(int) * array_len); ++ } ++ ++ /* save this match's locations */ ++ for (int i = 0; i < matchctx->npatterns; i++) ++ { ++ int so = pmatch[i].rm_so; ++ int eo = pmatch[i].rm_eo; ++ ++ matchctx->match_locs[array_idx++] = so; ++ matchctx->match_locs[array_idx++] = eo; ++ if (so >= 0 && eo >= 0 && (eo - so) > maxlen) ++ maxlen = (eo - so); ++ } ++ matchctx->nmatches++; ++ prev_match_end = pmatch[0].rm_eo; ++ ++ /* if not glob, stop after one match */ ++ if (!re_flags->glob) ++ break; ++ ++ /* ++ * Advance search position. Normally we start the next search at the ++ * end of the previous match; but if the match was of zero length, we ++ * have to advance by one character, or we'd just find the same match ++ * again. ++ */ ++ start_search = prev_match_end; ++ if (pmatch[0].rm_so == pmatch[0].rm_eo) ++ start_search++; ++ if (start_search > wide_len) ++ break; ++ } ++ ++ /* ++ * If we had no match, but "partial" and "indices" are set, emit the ++ * details. 
++ */ ++ if (matchctx->nmatches == 0 && re_flags->partial && re_flags->indices) ++ { ++ /* enlarge output space if needed */ ++ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) ++ { ++ array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ ++ if (array_len > MaxAllocSize / sizeof(int)) ++ ereport(ERROR, ++ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), ++ errmsg("too many regular expression matches"))); ++ matchctx->match_locs = (int *) repalloc(matchctx->match_locs, ++ sizeof(int) * array_len); ++ } ++ ++ matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_so; ++ matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_eo; ++ /* we don't have pmatch data, so emit -1 */ ++ for (int i = 1; i < matchctx->npatterns; i++) ++ { ++ matchctx->match_locs[array_idx++] = -1; ++ matchctx->match_locs[array_idx++] = -1; ++ } ++ matchctx->nmatches++; ++ } ++ ++ Assert(array_idx <= array_len); ++ ++ if (eml > 1) ++ { ++ int64 maxsiz = eml * (int64) maxlen; ++ int conv_bufsiz; ++ ++ /* ++ * Make the conversion buffer large enough for any substring of ++ * interest. ++ * ++ * Worst case: assume we need the maximum size (maxlen*eml), but take ++ * advantage of the fact that the original string length in bytes is ++ * an upper bound on the byte length of any fetched substring (and we ++ * know that len+1 is safe to allocate because the varlena header is ++ * longer than 1 byte). ++ */ ++ if (maxsiz > orig_len) ++ conv_bufsiz = orig_len + 1; ++ else ++ conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ ++ ++ matchctx->conv_buf = palloc(conv_bufsiz); ++ matchctx->conv_bufsiz = conv_bufsiz; ++ matchctx->wide_str = wide_str; ++ } ++ else ++ { ++ /* No need to keep the wide string if we're in a single-byte charset. 
*/ ++ pfree(wide_str); ++ matchctx->wide_str = NULL; ++ matchctx->conv_buf = NULL; ++ matchctx->conv_bufsiz = 0; ++ } ++ ++ /* Clean up temp storage */ ++ pfree(pmatch); ++ ++ return matchctx; ++} ++ ++/* ++ * build_test_info_result - build output array describing compiled regexp ++ * ++ * This borrows some code from Tcl's TclRegAbout(). ++ */ ++static ArrayType * ++build_test_info_result(regex_t *cpattern, test_re_flags *flags) ++{ ++ /* Translation data for flag bits in regex_t.re_info */ ++ struct infoname ++ { ++ int bit; ++ const char *text; ++ }; ++ static const struct infoname infonames[] = { ++ {REG_UBACKREF, "REG_UBACKREF"}, ++ {REG_ULOOKAROUND, "REG_ULOOKAROUND"}, ++ {REG_UBOUNDS, "REG_UBOUNDS"}, ++ {REG_UBRACES, "REG_UBRACES"}, ++ {REG_UBSALNUM, "REG_UBSALNUM"}, ++ {REG_UPBOTCH, "REG_UPBOTCH"}, ++ {REG_UBBS, "REG_UBBS"}, ++ {REG_UNONPOSIX, "REG_UNONPOSIX"}, ++ {REG_UUNSPEC, "REG_UUNSPEC"}, ++ {REG_UUNPORT, "REG_UUNPORT"}, ++ {REG_ULOCALE, "REG_ULOCALE"}, ++ {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, ++ {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, ++ {REG_USHORTEST, "REG_USHORTEST"}, ++ {0, NULL} ++ }; ++ const struct infoname *inf; ++ Datum elems[lengthof(infonames) + 1]; ++ int nresults = 0; ++ char buf[80]; ++ int dims[1]; ++ int lbs[1]; ++ ++ /* Set up results: first, the number of subexpressions */ ++ snprintf(buf, sizeof(buf), "%d", (int) cpattern->re_nsub); ++ elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); ++ ++ /* Report individual info bit states */ ++ for (inf = infonames; inf->bit != 0; inf++) ++ { ++ if (cpattern->re_info & inf->bit) ++ { ++ if (flags->info & inf->bit) ++ elems[nresults++] = PointerGetDatum(cstring_to_text(inf->text)); ++ else ++ { ++ snprintf(buf, sizeof(buf), "unexpected %s!", inf->text); ++ elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); ++ } ++ } ++ else ++ { ++ if (flags->info & inf->bit) ++ { ++ snprintf(buf, sizeof(buf), "missing %s!", inf->text); ++ elems[nresults++] = PointerGetDatum(cstring_to_text(buf)); 
++ } ++ } ++ } ++ ++ /* And form an array */ ++ dims[0] = nresults; ++ lbs[0] = 1; ++ /* XXX: this hardcodes assumptions about the text type */ ++ return construct_md_array(elems, NULL, 1, dims, lbs, ++ TEXTOID, -1, false, TYPALIGN_INT); ++} ++ ++/* ++ * build_test_match_result - build output array for current match ++ * ++ * Note that if the indices flag is set, we don't need any strings, ++ * just the location data. ++ */ ++static ArrayType * ++build_test_match_result(test_regex_ctx *matchctx) ++{ ++ char *buf = matchctx->conv_buf; ++ Datum *elems = matchctx->elems; ++ bool *nulls = matchctx->nulls; ++ bool indices = matchctx->re_flags.indices; ++ char bufstr[80]; ++ int dims[1]; ++ int lbs[1]; ++ int loc; ++ int i; ++ ++ /* Extract matching substrings from the original string */ ++ loc = matchctx->next_match * matchctx->npatterns * 2; ++ for (i = 0; i < matchctx->npatterns; i++) ++ { ++ int so = matchctx->match_locs[loc++]; ++ int eo = matchctx->match_locs[loc++]; ++ ++ if (indices) ++ { ++ /* Report eo this way for consistency with Tcl */ ++ snprintf(bufstr, sizeof(bufstr), "%d %d", ++ so, so < 0 ? 
eo : eo - 1); ++ elems[i] = PointerGetDatum(cstring_to_text(bufstr)); ++ nulls[i] = false; ++ } ++ else if (so < 0 || eo < 0) ++ { ++ elems[i] = (Datum) 0; ++ nulls[i] = true; ++ } ++ else if (buf) ++ { ++ int len = pg_wchar2mb_with_len(matchctx->wide_str + so, ++ buf, ++ eo - so); ++ ++ Assert(len < matchctx->conv_bufsiz); ++ elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len)); ++ nulls[i] = false; ++ } ++ else ++ { ++ elems[i] = DirectFunctionCall3(text_substr, ++ PointerGetDatum(matchctx->orig_str), ++ Int32GetDatum(so + 1), ++ Int32GetDatum(eo - so)); ++ nulls[i] = false; ++ } ++ } ++ ++ /* In EXPECT indices mode, also report the "details" */ ++ if (indices && (matchctx->re_flags.cflags & REG_EXPECT)) ++ { ++ int so = matchctx->details.rm_extend.rm_so; ++ int eo = matchctx->details.rm_extend.rm_eo; ++ ++ snprintf(bufstr, sizeof(bufstr), "%d %d", ++ so, so < 0 ? eo : eo - 1); ++ elems[i] = PointerGetDatum(cstring_to_text(bufstr)); ++ nulls[i] = false; ++ i++; ++ } ++ ++ /* And form an array */ ++ dims[0] = i; ++ lbs[0] = 1; ++ /* XXX: this hardcodes assumptions about the text type */ ++ return construct_md_array(elems, nulls, 1, dims, lbs, ++ TEXTOID, -1, false, TYPALIGN_INT); ++} +diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore +index 93c56c85a09..398292afad5 100644 +--- a/src/test/regress/expected/.gitignore ++++ b/src/test/regress/expected/.gitignore +@@ -2,6 +2,8 @@ + /copy.out + /create_function_1.out + /create_function_2.out ++/encoding.out ++/encoding_1.out + /largeobject.out + /largeobject_1.out + /misc.out +diff --git a/src/test/regress/expected/euc_kr.out b/src/test/regress/expected/euc_kr.out +new file mode 100644 +index 00000000000..7a61c89a43a +--- /dev/null ++++ b/src/test/regress/expected/euc_kr.out +@@ -0,0 +1,16 @@ ++-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent ++-- non-UTF8, multibyte encoding as of 2026-01. 
Since UTF8 can represent all ++-- of EUC_KR, also run the test in UTF8. ++SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset ++\if :skip_test ++\quit ++\endif ++-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. ++SELECT POSITION( ++ convert_from('\xbcf6c7d0', 'EUC_KR') IN ++ convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); ++ position ++---------- ++ 5 ++(1 row) ++ +diff --git a/src/test/regress/expected/euc_kr_1.out b/src/test/regress/expected/euc_kr_1.out +new file mode 100644 +index 00000000000..faaac5d6355 +--- /dev/null ++++ b/src/test/regress/expected/euc_kr_1.out +@@ -0,0 +1,6 @@ ++-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent ++-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all ++-- of EUC_KR, also run the test in UTF8. ++SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset ++\if :skip_test ++\quit +diff --git a/src/test/regress/input/encoding.source b/src/test/regress/input/encoding.source +new file mode 100644 +index 00000000000..efdfecd3c05 +--- /dev/null ++++ b/src/test/regress/input/encoding.source +@@ -0,0 +1,240 @@ ++/* skip test if not UTF8 server encoding */ ++SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset ++\if :skip_test ++\quit ++\endif ++ ++CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean ++ AS 
'@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++ ++ ++CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); ++INSERT INTO regress_encoding ++VALUES ('café', ++ 'caf' || test_bytea_to_text('\xc3'), ++ 'café' || test_bytea_to_text('\x00') || 'dcba', ++ 'caf' || test_bytea_to_text('\xc300') || 'dcba'); ++ ++SELECT good, truncated, with_nul FROM regress_encoding; ++ ++SELECT length(good) FROM regress_encoding; ++SELECT substring(good, 3, 1) FROM regress_encoding; ++SELECT substring(good, 4, 1) FROM regress_encoding; ++SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; ++SELECT reverse(good) FROM regress_encoding; ++ ++-- invalid short mb character = error ++SELECT length(truncated) FROM regress_encoding; ++SELECT substring(truncated, 1, 3) FROM regress_encoding; ++SELECT substring(truncated, 1, 4) FROM regress_encoding; ++SELECT reverse(truncated) FROM regress_encoding; ++-- invalid short mb character = silently dropped ++SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; ++ ++-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string ++-- contains NUL at a character boundary position, some functions treat it as a ++-- character while others treat it as a terminator, as implementation details. 
++ ++-- NUL = terminator ++SELECT length(with_nul) FROM regress_encoding; ++SELECT substring(with_nul, 3, 1) FROM regress_encoding; ++SELECT substring(with_nul, 4, 1) FROM regress_encoding; ++SELECT substring(with_nul, 5, 1) FROM regress_encoding; ++SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; ++SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; ++-- NUL = character ++SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; ++ ++-- If a corrupted string contains NUL in the tail bytes of a multibyte ++-- character (invalid in all encodings), it is considered part of the ++-- character for length purposes. An error will only be raised in code paths ++-- that convert or verify encodings. ++ ++SELECT length(truncated_with_nul) FROM regress_encoding; ++SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; ++SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; ++SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; ++SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; ++SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; ++SELECT reverse(truncated_with_nul) FROM regress_encoding; ++ ++-- unbounded: sequence would overrun the string! 
++SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) ++FROM regress_encoding; ++ ++-- condition detected when using the length/range variants ++SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) ++FROM regress_encoding; ++SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) ++FROM regress_encoding; ++ ++-- unbounded: sequence would overrun the string, if the terminator were really ++-- the end of it ++SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) ++FROM regress_encoding; ++SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) ++FROM regress_encoding; ++ ++-- condition detected when using the cstr variants ++SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) ++FROM regress_encoding; ++ ++DROP TABLE regress_encoding; ++ ++-- mb<->wchar conversions ++CREATE FUNCTION test_encoding(encoding text, description text, input bytea) ++RETURNS VOID LANGUAGE plpgsql AS ++$$ ++DECLARE ++ prefix text; ++ len int; ++ wchars int[]; ++ round_trip bytea; ++ result text; ++BEGIN ++ prefix := rpad(encoding || ' ' || description || ':', 28); ++ ++ -- XXX could also test validation, length functions and include client ++ -- only encodings with these test cases ++ ++ IF test_valid_server_encoding(encoding) THEN ++ wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); ++ round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); ++ if input = round_trip then ++ result := 'OK'; ++ elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then ++ result := 'truncated'; ++ else ++ result := 'failed'; ++ end if; ++ RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; ++ END IF; ++END; ++$$; ++-- No validation is done on the encoding itself, just the length to avoid ++-- overruns, so some of the byte sequences below are bogus. They cover ++-- all code branches, server encodings only for now. 
++CREATE TABLE encoding_tests (encoding text, description text, input bytea); ++INSERT INTO encoding_tests VALUES ++ -- LATIN1, other single-byte encodings ++ ('LATIN1', 'ASCII', 'a'), ++ ('LATIN1', 'extended', '\xe9'), ++ -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): ++ -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) ++ -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) ++ -- 2 80..ff (CS1) ++ ('EUC_JP', 'ASCII', 'a'), ++ ('EUC_JP', 'CS1, short', '\x80'), ++ ('EUC_JP', 'CS1', '\x8002'), ++ ('EUC_JP', 'CS2, short', '\x8e'), ++ ('EUC_JP', 'CS2', '\x8e02'), ++ ('EUC_JP', 'CS3, short', '\x8f'), ++ ('EUC_JP', 'CS3, short', '\x8f02'), ++ ('EUC_JP', 'CS3', '\x8f0203'), ++ -- EUC_CN ++ -- 3 8e (CS2, not used but arbitrarily considered to have length 3) ++ -- 3 8f (CS3, not used but arbitrarily considered to have length 3) ++ -- 2 80..ff (CS1) ++ ('EUC_CN', 'ASCII', 'a'), ++ ('EUC_CN', 'CS1, short', '\x80'), ++ ('EUC_CN', 'CS1', '\x8002'), ++ ('EUC_CN', 'CS2, short', '\x8e'), ++ ('EUC_CN', 'CS2, short', '\x8e02'), ++ ('EUC_CN', 'CS2', '\x8e0203'), ++ ('EUC_CN', 'CS3, short', '\x8f'), ++ ('EUC_CN', 'CS3, short', '\x8f02'), ++ ('EUC_CN', 'CS3', '\x8f0203'), ++ -- EUC_TW: ++ -- 4 8e (CS2) ++ -- 3 8f (CS3, not used but arbitrarily considered to have length 3) ++ -- 2 80..ff (CS1) ++ ('EUC_TW', 'ASCII', 'a'), ++ ('EUC_TW', 'CS1, short', '\x80'), ++ ('EUC_TW', 'CS1', '\x8002'), ++ ('EUC_TW', 'CS2, short', '\x8e'), ++ ('EUC_TW', 'CS2, short', '\x8e02'), ++ ('EUC_TW', 'CS2, short', '\x8e0203'), ++ ('EUC_TW', 'CS2', '\x8e020304'), ++ ('EUC_TW', 'CS3, short', '\x8f'), ++ ('EUC_TW', 'CS3, short', '\x8f02'), ++ ('EUC_TW', 'CS3', '\x8f0203'), ++ -- UTF8 ++ -- 2 c0..df ++ -- 3 e0..ef ++ -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) ++ -- 5 f8..fb (not supported) ++ -- 6 fc..fd (not supported) ++ ('UTF8', 'ASCII', 'a'), ++ ('UTF8', '2 byte, short', '\xdf'), ++ ('UTF8', '2 byte', 
'\xdf82'), ++ ('UTF8', '3 byte, short', '\xef'), ++ ('UTF8', '3 byte, short', '\xef82'), ++ ('UTF8', '3 byte', '\xef8283'), ++ ('UTF8', '4 byte, short', '\xf7'), ++ ('UTF8', '4 byte, short', '\xf782'), ++ ('UTF8', '4 byte, short', '\xf78283'), ++ ('UTF8', '4 byte', '\xf7828384'), ++ ('UTF8', '5 byte, unsupported', '\xfb'), ++ ('UTF8', '5 byte, unsupported', '\xfb82'), ++ ('UTF8', '5 byte, unsupported', '\xfb8283'), ++ ('UTF8', '5 byte, unsupported', '\xfb828384'), ++ ('UTF8', '5 byte, unsupported', '\xfb82838485'), ++ ('UTF8', '6 byte, unsupported', '\xfd'), ++ ('UTF8', '6 byte, unsupported', '\xfd82'), ++ ('UTF8', '6 byte, unsupported', '\xfd8283'), ++ ('UTF8', '6 byte, unsupported', '\xfd828384'), ++ ('UTF8', '6 byte, unsupported', '\xfd82838485'), ++ ('UTF8', '6 byte, unsupported', '\xfd8283848586'), ++ -- MULE_INTERNAL ++ -- 2 81..8d LC1 ++ -- 3 90..99 LC2 ++ ('MULE_INTERNAL', 'ASCII', 'a'), ++ ('MULE_INTERNAL', 'LC1, short', '\x81'), ++ ('MULE_INTERNAL', 'LC1', '\x8182'), ++ ('MULE_INTERNAL', 'LC2, short', '\x90'), ++ ('MULE_INTERNAL', 'LC2, short', '\x9082'), ++ ('MULE_INTERNAL', 'LC2', '\x908283'); ++ ++SELECT COUNT(test_encoding(encoding, description, input)) > 0 ++FROM encoding_tests; ++ ++-- substring fetches a slice of a toasted value; unused tail of that slice is ++-- an incomplete char (bug #19406) ++CREATE TABLE toast_3b_utf8 (c text); ++INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); ++SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; ++SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; ++-- diagnose incomplete char iff within the substring ++UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); ++SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; ++SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; ++-- substring needing last byte of its slice_size ++ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; ++UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); ++SELECT SUBSTRING(c FROM 3000 FOR 1) FROM 
toast_4b_utf8; ++ ++DROP TABLE encoding_tests; ++DROP TABLE toast_4b_utf8; ++DROP FUNCTION test_encoding; ++DROP FUNCTION test_text_to_wchars; ++DROP FUNCTION test_mblen_func; ++DROP FUNCTION test_bytea_to_text; ++DROP FUNCTION test_text_to_bytea; ++ ++ ++-- substring slow path: multi-byte escape char vs. multi-byte pattern char. ++SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); ++-- Levenshtein distance metric: exercise character length cache. ++SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); ++-- JSON errcontext: truncate long data. ++SELECT repeat(U&'\00A7', 30)::json; +diff --git a/src/test/regress/output/encoding.source b/src/test/regress/output/encoding.source +new file mode 100644 +index 00000000000..63c785f4d1b +--- /dev/null ++++ b/src/test/regress/output/encoding.source +@@ -0,0 +1,438 @@ ++/* skip test if not UTF8 server encoding */ ++SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset ++\if :skip_test ++\quit ++\endif ++CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean ++ AS '@libdir@/regress@DLSUFFIX@' LANGUAGE C STRICT; ++CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); ++INSERT INTO regress_encoding ++VALUES ('café', ++ 'caf' || test_bytea_to_text('\xc3'), ++ 'café' || test_bytea_to_text('\x00') || 'dcba', ++ 'caf' || test_bytea_to_text('\xc300') || 'dcba'); ++SELECT good, truncated, 
with_nul FROM regress_encoding; ++ good | truncated | with_nul ++------+-----------+---------- ++ café | caf | café ++(1 row) ++ ++SELECT length(good) FROM regress_encoding; ++ length ++-------- ++ 4 ++(1 row) ++ ++SELECT substring(good, 3, 1) FROM regress_encoding; ++ substring ++----------- ++ f ++(1 row) ++ ++SELECT substring(good, 4, 1) FROM regress_encoding; ++ substring ++----------- ++ é ++(1 row) ++ ++SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; ++ regexp_replace ++---------------- ++ é ++(1 row) ++ ++SELECT reverse(good) FROM regress_encoding; ++ reverse ++--------- ++ éfac ++(1 row) ++ ++-- invalid short mb character = error ++SELECT length(truncated) FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 ++SELECT substring(truncated, 1, 3) FROM regress_encoding; ++ substring ++----------- ++ caf ++(1 row) ++ ++SELECT substring(truncated, 1, 4) FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 ++SELECT reverse(truncated) FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 ++-- invalid short mb character = silently dropped ++SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; ++ regexp_replace ++---------------- ++ caf ++(1 row) ++ ++-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string ++-- contains NUL at a character boundary position, some functions treat it as a ++-- character while others treat it as a terminator, as implementation details. 
++-- NUL = terminator ++SELECT length(with_nul) FROM regress_encoding; ++ length ++-------- ++ 4 ++(1 row) ++ ++SELECT substring(with_nul, 3, 1) FROM regress_encoding; ++ substring ++----------- ++ f ++(1 row) ++ ++SELECT substring(with_nul, 4, 1) FROM regress_encoding; ++ substring ++----------- ++ é ++(1 row) ++ ++SELECT substring(with_nul, 5, 1) FROM regress_encoding; ++ substring ++----------- ++ ++(1 row) ++ ++SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; ++ convert_to ++------------ ++ \x ++(1 row) ++ ++SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; ++ regexp_replace ++---------------- ++ é ++(1 row) ++ ++-- NUL = character ++SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; ++ with_nul | reverse | reverse ++----------+---------+--------- ++ café | abcd | café ++(1 row) ++ ++-- If a corrupted string contains NUL in the tail bytes of a multibyte ++-- character (invalid in all encodings), it is considered part of the ++-- character for length purposes. An error will only be raised in code paths ++-- that convert or verify encodings. ++SELECT length(truncated_with_nul) FROM regress_encoding; ++ length ++-------- ++ 8 ++(1 row) ++ ++SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; ++ substring ++----------- ++ f ++(1 row) ++ ++SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; ++ substring ++----------- ++ ++(1 row) ++ ++SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 0x00 ++SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; ++ substring ++----------- ++ d ++(1 row) ++ ++SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; ++ ?column? 
++---------- ++ t ++(1 row) ++ ++SELECT reverse(truncated_with_nul) FROM regress_encoding; ++ reverse ++--------- ++ abcd ++(1 row) ++ ++-- unbounded: sequence would overrun the string! ++SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) ++FROM regress_encoding; ++ test_mblen_func ++----------------- ++ 2 ++(1 row) ++ ++-- condition detected when using the length/range variants ++SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) ++FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 ++SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) ++FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 ++-- unbounded: sequence would overrun the string, if the terminator were really ++-- the end of it ++SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) ++FROM regress_encoding; ++ test_mblen_func ++----------------- ++ 2 ++(1 row) ++ ++SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) ++FROM regress_encoding; ++ test_mblen_func ++----------------- ++ 2 ++(1 row) ++ ++-- condition detected when using the cstr variants ++SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) ++FROM regress_encoding; ++ERROR: invalid byte sequence for encoding "UTF8": 0xc3 ++DROP TABLE regress_encoding; ++-- mb<->wchar conversions ++CREATE FUNCTION test_encoding(encoding text, description text, input bytea) ++RETURNS VOID LANGUAGE plpgsql AS ++$$ ++DECLARE ++ prefix text; ++ len int; ++ wchars int[]; ++ round_trip bytea; ++ result text; ++BEGIN ++ prefix := rpad(encoding || ' ' || description || ':', 28); ++ ++ -- XXX could also test validation, length functions and include client ++ -- only encodings with these test cases ++ ++ IF test_valid_server_encoding(encoding) THEN ++ wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); ++ round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars)); ++ if input = 
round_trip then ++ result := 'OK'; ++ elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then ++ result := 'truncated'; ++ else ++ result := 'failed'; ++ end if; ++ RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; ++ END IF; ++END; ++$$; ++-- No validation is done on the encoding itself, just the length to avoid ++-- overruns, so some of the byte sequences below are bogus. They cover ++-- all code branches, server encodings only for now. ++CREATE TABLE encoding_tests (encoding text, description text, input bytea); ++INSERT INTO encoding_tests VALUES ++ -- LATIN1, other single-byte encodings ++ ('LATIN1', 'ASCII', 'a'), ++ ('LATIN1', 'extended', '\xe9'), ++ -- EUC_JP, EUC_JIS_2004, EUR_KR (for the purposes of wchar conversion): ++ -- 2 8e (CS2, not used by EUR_KR but arbitrarily considered to have EUC_JP length) ++ -- 3 8f (CS3, not used by EUR_KR but arbitrarily considered to have EUC_JP length) ++ -- 2 80..ff (CS1) ++ ('EUC_JP', 'ASCII', 'a'), ++ ('EUC_JP', 'CS1, short', '\x80'), ++ ('EUC_JP', 'CS1', '\x8002'), ++ ('EUC_JP', 'CS2, short', '\x8e'), ++ ('EUC_JP', 'CS2', '\x8e02'), ++ ('EUC_JP', 'CS3, short', '\x8f'), ++ ('EUC_JP', 'CS3, short', '\x8f02'), ++ ('EUC_JP', 'CS3', '\x8f0203'), ++ -- EUC_CN ++ -- 3 8e (CS2, not used but arbitrarily considered to have length 3) ++ -- 3 8f (CS3, not used but arbitrarily considered to have length 3) ++ -- 2 80..ff (CS1) ++ ('EUC_CN', 'ASCII', 'a'), ++ ('EUC_CN', 'CS1, short', '\x80'), ++ ('EUC_CN', 'CS1', '\x8002'), ++ ('EUC_CN', 'CS2, short', '\x8e'), ++ ('EUC_CN', 'CS2, short', '\x8e02'), ++ ('EUC_CN', 'CS2', '\x8e0203'), ++ ('EUC_CN', 'CS3, short', '\x8f'), ++ ('EUC_CN', 'CS3, short', '\x8f02'), ++ ('EUC_CN', 'CS3', '\x8f0203'), ++ -- EUC_TW: ++ -- 4 8e (CS2) ++ -- 3 8f (CS3, not used but arbitrarily considered to have length 3) ++ -- 2 80..ff (CS1) ++ ('EUC_TW', 'ASCII', 'a'), ++ ('EUC_TW', 'CS1, short', '\x80'), ++ ('EUC_TW', 'CS1', '\x8002'), ++ 
('EUC_TW', 'CS2, short', '\x8e'), ++ ('EUC_TW', 'CS2, short', '\x8e02'), ++ ('EUC_TW', 'CS2, short', '\x8e0203'), ++ ('EUC_TW', 'CS2', '\x8e020304'), ++ ('EUC_TW', 'CS3, short', '\x8f'), ++ ('EUC_TW', 'CS3, short', '\x8f02'), ++ ('EUC_TW', 'CS3', '\x8f0203'), ++ -- UTF8 ++ -- 2 c0..df ++ -- 3 e0..ef ++ -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) ++ -- 5 f8..fb (not supported) ++ -- 6 fc..fd (not supported) ++ ('UTF8', 'ASCII', 'a'), ++ ('UTF8', '2 byte, short', '\xdf'), ++ ('UTF8', '2 byte', '\xdf82'), ++ ('UTF8', '3 byte, short', '\xef'), ++ ('UTF8', '3 byte, short', '\xef82'), ++ ('UTF8', '3 byte', '\xef8283'), ++ ('UTF8', '4 byte, short', '\xf7'), ++ ('UTF8', '4 byte, short', '\xf782'), ++ ('UTF8', '4 byte, short', '\xf78283'), ++ ('UTF8', '4 byte', '\xf7828384'), ++ ('UTF8', '5 byte, unsupported', '\xfb'), ++ ('UTF8', '5 byte, unsupported', '\xfb82'), ++ ('UTF8', '5 byte, unsupported', '\xfb8283'), ++ ('UTF8', '5 byte, unsupported', '\xfb828384'), ++ ('UTF8', '5 byte, unsupported', '\xfb82838485'), ++ ('UTF8', '6 byte, unsupported', '\xfd'), ++ ('UTF8', '6 byte, unsupported', '\xfd82'), ++ ('UTF8', '6 byte, unsupported', '\xfd8283'), ++ ('UTF8', '6 byte, unsupported', '\xfd828384'), ++ ('UTF8', '6 byte, unsupported', '\xfd82838485'), ++ ('UTF8', '6 byte, unsupported', '\xfd8283848586'), ++ -- MULE_INTERNAL ++ -- 2 81..8d LC1 ++ -- 3 90..99 LC2 ++ ('MULE_INTERNAL', 'ASCII', 'a'), ++ ('MULE_INTERNAL', 'LC1, short', '\x81'), ++ ('MULE_INTERNAL', 'LC1', '\x8182'), ++ ('MULE_INTERNAL', 'LC2, short', '\x90'), ++ ('MULE_INTERNAL', 'LC2, short', '\x9082'), ++ ('MULE_INTERNAL', 'LC2', '\x908283'); ++SELECT COUNT(test_encoding(encoding, description, input)) > 0 ++FROM encoding_tests; ++NOTICE: LATIN1 ASCII: \x61 -> {97} -> \x61 = OK ++NOTICE: LATIN1 extended: \xe9 -> {233} -> \xe9 = OK ++NOTICE: EUC_JP ASCII: \x61 -> {97} -> \x61 = OK ++NOTICE: EUC_JP CS1, short: \x80 -> {128} -> \x80 = OK ++NOTICE: EUC_JP CS1: \x8002 -> {32770} -> \x8002 = OK ++NOTICE: 
EUC_JP CS2, short: \x8e -> {142} -> \x8e = OK ++NOTICE: EUC_JP CS2: \x8e02 -> {36354} -> \x8e02 = OK ++NOTICE: EUC_JP CS3, short: \x8f -> {143} -> \x8f = OK ++NOTICE: EUC_JP CS3, short: \x8f02 -> {36610} -> \x8f02 = OK ++NOTICE: EUC_JP CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK ++NOTICE: EUC_CN ASCII: \x61 -> {97} -> \x61 = OK ++NOTICE: EUC_CN CS1, short: \x80 -> {128} -> \x80 = OK ++NOTICE: EUC_CN CS1: \x8002 -> {32770} -> \x8002 = OK ++NOTICE: EUC_CN CS2, short: \x8e -> {142} -> \x8e = OK ++NOTICE: EUC_CN CS2, short: \x8e02 -> {36354} -> \x8e02 = OK ++NOTICE: EUC_CN CS2: \x8e0203 -> {9306627} -> \x8e0203 = OK ++NOTICE: EUC_CN CS3, short: \x8f -> {143} -> \x8f = OK ++NOTICE: EUC_CN CS3, short: \x8f02 -> {36610} -> \x8f02 = OK ++NOTICE: EUC_CN CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK ++NOTICE: EUC_TW ASCII: \x61 -> {97} -> \x61 = OK ++NOTICE: EUC_TW CS1, short: \x80 -> {128} -> \x80 = OK ++NOTICE: EUC_TW CS1: \x8002 -> {32770} -> \x8002 = OK ++NOTICE: EUC_TW CS2, short: \x8e -> {142} -> \x8e = OK ++NOTICE: EUC_TW CS2, short: \x8e02 -> {36354} -> \x8e02 = OK ++NOTICE: EUC_TW CS2, short: \x8e0203 -> {36354,3} -> \x8e0203 = OK ++NOTICE: EUC_TW CS2: \x8e020304 -> {-1912470780} -> \x8e020304 = OK ++NOTICE: EUC_TW CS3, short: \x8f -> {143} -> \x8f = OK ++NOTICE: EUC_TW CS3, short: \x8f02 -> {36610} -> \x8f02 = OK ++NOTICE: EUC_TW CS3: \x8f0203 -> {9372163} -> \x8f0203 = OK ++NOTICE: UTF8 ASCII: \x61 -> {97} -> \x61 = OK ++NOTICE: UTF8 2 byte, short: \xdf -> {} -> \x = truncated ++NOTICE: UTF8 2 byte: \xdf82 -> {1986} -> \xdf82 = OK ++NOTICE: UTF8 3 byte, short: \xef -> {} -> \x = truncated ++NOTICE: UTF8 3 byte, short: \xef82 -> {} -> \x = truncated ++NOTICE: UTF8 3 byte: \xef8283 -> {61571} -> \xef8283 = OK ++NOTICE: UTF8 4 byte, short: \xf7 -> {} -> \x = truncated ++NOTICE: UTF8 4 byte, short: \xf782 -> {} -> \x = truncated ++NOTICE: UTF8 4 byte, short: \xf78283 -> {} -> \x = truncated ++NOTICE: UTF8 4 byte: \xf7828384 -> {1843396} -> \xf7828384 = OK ++NOTICE: UTF8 
5 byte, unsupported: \xfb -> {251} -> \xc3bb = failed ++NOTICE: UTF8 5 byte, unsupported: \xfb82 -> {251,130} -> \xc3bbc282 = failed ++NOTICE: UTF8 5 byte, unsupported: \xfb8283 -> {251,130,131} -> \xc3bbc282c283 = failed ++NOTICE: UTF8 5 byte, unsupported: \xfb828384 -> {251,130,131,132} -> \xc3bbc282c283c284 = failed ++NOTICE: UTF8 5 byte, unsupported: \xfb82838485 -> {251,130,131,132,133} -> \xc3bbc282c283c284c285 = failed ++NOTICE: UTF8 6 byte, unsupported: \xfd -> {253} -> \xc3bd = failed ++NOTICE: UTF8 6 byte, unsupported: \xfd82 -> {253,130} -> \xc3bdc282 = failed ++NOTICE: UTF8 6 byte, unsupported: \xfd8283 -> {253,130,131} -> \xc3bdc282c283 = failed ++NOTICE: UTF8 6 byte, unsupported: \xfd828384 -> {253,130,131,132} -> \xc3bdc282c283c284 = failed ++NOTICE: UTF8 6 byte, unsupported: \xfd82838485 -> {253,130,131,132,133} -> \xc3bdc282c283c284c285 = failed ++NOTICE: UTF8 6 byte, unsupported: \xfd8283848586 -> {253,130,131,132,133,134} -> \xc3bdc282c283c284c285c286 = failed ++NOTICE: MULE_INTERNAL ASCII: \x61 -> {97} -> \x61 = OK ++NOTICE: MULE_INTERNAL LC1, short: \x81 -> {129} -> \x81 = OK ++NOTICE: MULE_INTERNAL LC1: \x8182 -> {8454274} -> \x8182 = OK ++NOTICE: MULE_INTERNAL LC2, short: \x90 -> {144} -> \x90 = OK ++NOTICE: MULE_INTERNAL LC2, short: \x9082 -> {144,130} -> \x9082 = OK ++NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK ++ ?column? 
++---------- ++ t ++(1 row) ++ ++-- substring fetches a slice of a toasted value; unused tail of that slice is ++-- an incomplete char (bug #19406) ++CREATE TABLE toast_3b_utf8 (c text); ++INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); ++SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; ++ substring ++----------- ++ … ++(1 row) ++ ++SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; ++ substring ++----------- ++ ++(1 row) ++ ++-- diagnose incomplete char iff within the substring ++UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); ++SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; ++ substring ++----------- ++ … ++(1 row) ++ ++SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; ++ERROR: invalid byte sequence for encoding "UTF8": 0xe2 0x80 ++-- substring needing last byte of its slice_size ++ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; ++UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); ++SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; ++ substring ++----------- ++ 🚀 ++(1 row) ++ ++DROP TABLE encoding_tests; ++DROP TABLE toast_4b_utf8; ++DROP FUNCTION test_encoding; ++DROP FUNCTION test_text_to_wchars; ++DROP FUNCTION test_mblen_func; ++DROP FUNCTION test_bytea_to_text; ++DROP FUNCTION test_text_to_bytea; ++-- substring slow path: multi-byte escape char vs. multi-byte pattern char. ++SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); ++ERROR: syntax error at or near "U&'\00AC'" ++LINE 1: SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); ++ ^ ++-- Levenshtein distance metric: exercise character length cache. ++SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); ++ERROR: column "real§_name" does not exist ++LINE 1: SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); ++ ^ ++HINT: Perhaps you meant to reference the column "x.real_name". ++-- JSON errcontext: truncate long data. 
++SELECT repeat(U&'\00A7', 30)::json; ++ERROR: invalid input syntax for type json ++DETAIL: Token "§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§" is invalid. ++CONTEXT: JSON data, line 1: ...§§§§§§§§§§§§§§§§§§§§§§§§ +diff --git a/src/test/regress/output/encoding_1.source b/src/test/regress/output/encoding_1.source +new file mode 100644 +index 00000000000..a5b02090901 +--- /dev/null ++++ b/src/test/regress/output/encoding_1.source +@@ -0,0 +1,4 @@ ++/* skip test if not UTF8 server encoding */ ++SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset ++\if :skip_test ++\quit +diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule +index e146b24bef3..bb4019c8039 100644 +--- a/src/test/regress/parallel_schedule ++++ b/src/test/regress/parallel_schedule +@@ -27,7 +27,7 @@ test: strings numerology point lseg line box path polygon circle date time timet + # geometry depends on point, lseg, box, path, polygon and circle + # horology depends on interval, timetz, timestamp, timestamptz + # ---------- +-test: geometry horology regex oidjoins type_sanity opr_sanity misc_sanity comments expressions unicode database ++test: geometry horology tstypes regex type_sanity opr_sanity misc_sanity comments expressions unicode xid database encoding euc_kr + + # ---------- + # These four each depend on the previous one +diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c +index ed75c410ffe..bcda74e9c37 100644 +--- a/src/test/regress/regress.c ++++ b/src/test/regress/regress.c +@@ -1141,3 +1141,143 @@ test_enc_setup(PG_FUNCTION_ARGS) + + PG_RETURN_VOID(); + } ++ ++/* Convert bytea to text without validation for corruption tests from SQL. */ ++PG_FUNCTION_INFO_V1(test_bytea_to_text); ++Datum ++test_bytea_to_text(PG_FUNCTION_ARGS) ++{ ++ PG_RETURN_TEXT_P(PG_GETARG_BYTEA_PP(0)); ++} ++ ++/* And the reverse. 
*/ ++PG_FUNCTION_INFO_V1(test_text_to_bytea); ++Datum ++test_text_to_bytea(PG_FUNCTION_ARGS) ++{ ++ PG_RETURN_BYTEA_P(PG_GETARG_TEXT_PP(0)); ++} ++ ++/* Corruption tests in C. */ ++PG_FUNCTION_INFO_V1(test_mblen_func); ++Datum ++test_mblen_func(PG_FUNCTION_ARGS) ++{ ++ const char *func = text_to_cstring(PG_GETARG_BYTEA_PP(0)); ++ const char *encoding = text_to_cstring(PG_GETARG_BYTEA_PP(1)); ++ text *string = PG_GETARG_BYTEA_PP(2); ++ int offset = PG_GETARG_INT32(3); ++ const char *data = VARDATA_ANY(string); ++ size_t size = VARSIZE_ANY_EXHDR(string); ++ int result = 0; ++ ++ if (strcmp(func, "pg_mblen_unbounded") == 0) ++ result = pg_mblen_unbounded(data + offset); ++ else if (strcmp(func, "pg_mblen_cstr") == 0) ++ result = pg_mblen_cstr(data + offset); ++ else if (strcmp(func, "pg_mblen_with_len") == 0) ++ result = pg_mblen_with_len(data + offset, size - offset); ++ else if (strcmp(func, "pg_mblen_range") == 0) ++ result = pg_mblen_range(data + offset, data + size); ++ else if (strcmp(func, "pg_encoding_mblen") == 0) ++ result = pg_encoding_mblen(pg_char_to_encoding(encoding), data + offset); ++ else ++ elog(ERROR, "unknown function"); ++ ++ PG_RETURN_INT32(result); ++} ++ ++PG_FUNCTION_INFO_V1(test_text_to_wchars); ++Datum ++test_text_to_wchars(PG_FUNCTION_ARGS) ++{ ++ const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0)); ++ text *string = PG_GETARG_TEXT_PP(1); ++ const char *data = VARDATA_ANY(string); ++ size_t size = VARSIZE_ANY_EXHDR(string); ++ pg_wchar *wchars = palloc(sizeof(pg_wchar) * (size + 1)); ++ Datum *datums; ++ int wlen; ++ int encoding; ++ ++ encoding = pg_char_to_encoding(encoding_name); ++ if (encoding < 0) ++ elog(ERROR, "unknown encoding name: %s", encoding_name); ++ ++ if (size > 0) ++ { ++ datums = palloc(sizeof(Datum) * size); ++ wlen = pg_encoding_mb2wchar_with_len(encoding, ++ data, ++ wchars, ++ size); ++ Assert(wlen >= 0); ++ Assert(wlen <= size); ++ Assert(wchars[wlen] == 0); ++ ++ for (int i = 0; i < wlen; ++i) ++ 
datums[i] = UInt32GetDatum(wchars[i]); ++ } ++ else ++ { ++ datums = NULL; ++ wlen = 0; ++ } ++ ++ PG_RETURN_ARRAYTYPE_P(construct_array_builtin(datums, wlen, INT4OID)); ++} ++ ++PG_FUNCTION_INFO_V1(test_wchars_to_text); ++Datum ++test_wchars_to_text(PG_FUNCTION_ARGS) ++{ ++ const char *encoding_name = text_to_cstring(PG_GETARG_BYTEA_PP(0)); ++ ArrayType *array = PG_GETARG_ARRAYTYPE_P(1); ++ Datum *datums; ++ bool *nulls; ++ char *mb; ++ text *result; ++ int wlen; ++ int bytes; ++ int encoding; ++ ++ encoding = pg_char_to_encoding(encoding_name); ++ if (encoding < 0) ++ elog(ERROR, "unknown encoding name: %s", encoding_name); ++ ++ deconstruct_array_builtin(array, INT4OID, &datums, &nulls, &wlen); ++ ++ if (wlen > 0) ++ { ++ pg_wchar *wchars = palloc(sizeof(pg_wchar) * wlen); ++ ++ for (int i = 0; i < wlen; ++i) ++ { ++ if (nulls[i]) ++ elog(ERROR, "unexpected NULL in array"); ++ wchars[i] = DatumGetInt32(datums[i]); ++ } ++ ++ mb = palloc(pg_encoding_max_length(encoding) * wlen + 1); ++ bytes = pg_encoding_wchar2mb_with_len(encoding, wchars, mb, wlen); ++ } ++ else ++ { ++ mb = ""; ++ bytes = 0; ++ } ++ ++ result = palloc(bytes + VARHDRSZ); ++ SET_VARSIZE(result, bytes + VARHDRSZ); ++ memcpy(VARDATA(result), mb, bytes); ++ ++ PG_RETURN_TEXT_P(result); ++} ++ ++PG_FUNCTION_INFO_V1(test_valid_server_encoding); ++Datum ++test_valid_server_encoding(PG_FUNCTION_ARGS) ++{ ++ return pg_valid_server_encoding(text_to_cstring(PG_GETARG_TEXT_PP(0))); ++} ++ +diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore +index 46c8112094c..f5a279c2ee4 100644 +--- a/src/test/regress/sql/.gitignore ++++ b/src/test/regress/sql/.gitignore +@@ -2,6 +2,7 @@ + /copy.sql + /create_function_1.sql + /create_function_2.sql ++/encoding.sql + /largeobject.sql + /misc.sql + /security_label.sql +diff --git a/src/test/regress/sql/euc_kr.sql b/src/test/regress/sql/euc_kr.sql +new file mode 100644 +index 00000000000..1851b2a8c14 +--- /dev/null ++++ 
b/src/test/regress/sql/euc_kr.sql +@@ -0,0 +1,12 @@ ++-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent ++-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all ++-- of EUC_KR, also run the test in UTF8. ++SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset ++\if :skip_test ++\quit ++\endif ++ ++-- Exercise is_multibyte_char_in_char (non-UTF8) slow path. ++SELECT POSITION( ++ convert_from('\xbcf6c7d0', 'EUC_KR') IN ++ convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR')); +diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list +index 4824097ead2..b8e20d0c4fe 100644 +--- a/src/tools/pgindent/typedefs.list ++++ b/src/tools/pgindent/typedefs.list +@@ -653,6 +653,7 @@ ExtensibleNodeMethods + ExtensionControlFile + ExtensionInfo + ExtensionMemberId ++ExtensionSiblingCache + ExtensionVersionInfo + FDWCollateState + FD_SET diff --git a/SPECS/postgresql.spec b/SPECS/postgresql.spec index 36cf789..4a2fda4 100644 --- a/SPECS/postgresql.spec +++ b/SPECS/postgresql.spec @@ -63,7 +63,7 @@ Summary: PostgreSQL client programs Name: postgresql %global majorversion 13 Version: %{majorversion}.23 -Release: 1%{?dist} +Release: 2%{?dist} # The PostgreSQL license is very similar to other MIT licenses, but the OSI # recognizes it as an independent license, so we do as well. @@ -111,6 +111,7 @@ Patch8: postgresql-external-libpq.patch Patch9: postgresql-server-pg_config.patch Patch12: postgresql-no-libecpg.patch Patch14: postgresql-pgcrypto-openssl3-tests.patch +Patch15: CVE-2026-2004--CVE-2026-2005--CVE-2026-2006.patch BuildRequires: make BuildRequires: gcc @@ -426,6 +427,7 @@ goal of accelerating analytics queries. %endif %patch -P 9 -p1 %patch -P 14 -p1 +%patch -P 15 -p1 # We used to run autoconf here, but there's no longer any real need to, # since Postgres ships with a reasonably modern configure script. 
@@ -1230,6 +1232,9 @@ make -C postgresql-setup-%{setup_version} check %changelog +* Wed Feb 25 2026 Filip Janus  - 13.23-2 +- fix CVE-2026-2004 CVE-2026-2005 CVE-2026-2006 + * Fri Dec 05 2025 Filip Janus - 13.23-1 - Update to 13.23 - Resolves: RHEL-128812 (CVE-2025-12818)