diff --git a/SOURCES/backport-cve-2025-1094.patch b/SOURCES/backport-cve-2025-1094.patch new file mode 100644 index 0000000..4db01d7 --- /dev/null +++ b/SOURCES/backport-cve-2025-1094.patch @@ -0,0 +1,3670 @@ +From 62235454d50a62138341a87be065e4681684753a Mon Sep 17 00:00:00 2001 +From: Andres Freund +Date: Mon, 10 Feb 2025 10:03:37 -0500 +Subject: [PATCH 1/8] Backport upstream commit + 4dc28963533704fc7dd922b9447467466a233d89 Add pg_encoding_set_invalid() + +There are cases where we cannot / do not want to error out for invalidly +encoded input. In such cases it can be useful to replace e.g. an incomplete +multi-byte characters with bytes that will trigger an error when getting +validated as part of a larger string. + +Unfortunately, until now, for some encoding no such sequence existed. For +those encodings this commit removes one previously accepted input combination +- we consider that to be ok, as the chosen bytes are outside of the valid +ranges for the encodings, we just previously failed to detect that. + +As we cannot add a new field to pg_wchar_table without breaking ABI, this is +implemented "in-line" in the newly added function. +--- + src/backend/utils/mb/wchar.c | 55 +++++++++++++++++++++++++++++++++++- + src/include/mb/pg_wchar.h | 1 + + 2 files changed, 55 insertions(+), 1 deletion(-) + +diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c +index 1b5ce1740c0..872241cc804 100644 +--- a/src/backend/utils/mb/wchar.c ++++ b/src/backend/utils/mb/wchar.c +@@ -14,6 +14,25 @@ + #include "mb/pg_wchar.h" + + ++/* ++ * In today's multibyte encodings other than UTF8, this two-byte sequence ++ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0. ++ * ++ * For historical reasons, several verifychar implementations opt to reject ++ * this pair specifically. Byte pair range constraints, in encoding ++ * originator documentation, always excluded this pair. No core conversion ++ * could translate it. 
However, longstanding verifychar implementations ++ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate ++ * pairs not valid per encoding originator documentation. To avoid tightening ++ * core or non-core conversions in a security patch, we sought this one pair. ++ * ++ * PQescapeString() historically used spaces for BYTE1; many other values ++ * could suffice for BYTE1. ++ */ ++#define NONUTF8_INVALID_BYTE0 (0x8d) ++#define NONUTF8_INVALID_BYTE1 (' ') ++ ++ + /* + * Operations on multi-byte encodings are driven by a table of helper + * functions. +@@ -1394,6 +1413,11 @@ pg_big5_verifier(const unsigned char *s, int len) + if (len < l) + return -1; + ++ if (l == 2 && ++ s[0] == NONUTF8_INVALID_BYTE0 && ++ s[1] == NONUTF8_INVALID_BYTE1) ++ return -1; ++ + while (--l > 0) + { + if (*++s == '\0') +@@ -1414,6 +1438,11 @@ pg_gbk_verifier(const unsigned char *s, int len) + if (len < l) + return -1; + ++ if (l == 2 && ++ s[0] == NONUTF8_INVALID_BYTE0 && ++ s[1] == NONUTF8_INVALID_BYTE1) ++ return -1; ++ + while (--l > 0) + { + if (*++s == '\0') +@@ -1434,6 +1463,11 @@ pg_uhc_verifier(const unsigned char *s, int len) + if (len < l) + return -1; + ++ if (l == 2 && ++ s[0] == NONUTF8_INVALID_BYTE0 && ++ s[1] == NONUTF8_INVALID_BYTE1) ++ return -1; ++ + while (--l > 0) + { + if (*++s == '\0') +@@ -1768,6 +1802,19 @@ pg_eucjp_increment(unsigned char *charptr, int length) + #endif /* !FRONTEND */ + + ++/* ++ * Fills the provided buffer with two bytes such that: ++ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0 ++ */ ++void ++pg_encoding_set_invalid(int encoding, char *dst) ++{ ++ Assert(pg_encoding_max_length(encoding) > 1); ++ ++ dst[0] = (encoding == PG_UTF8 ? 
0xc0 : NONUTF8_INVALID_BYTE0); ++ dst[1] = NONUTF8_INVALID_BYTE1; ++} ++ + /* + *------------------------------------------------------------------- + * encoding info table +@@ -1869,7 +1916,13 @@ pg_encoding_max_length(int encoding) + { + Assert(PG_VALID_ENCODING(encoding)); + +- return pg_wchar_table[encoding].maxmblen; ++ /* ++ * Check for the encoding despite the assert, due to some mingw versions ++ * otherwise issuing bogus warnings. ++ */ ++ return PG_VALID_ENCODING(encoding) ? ++ pg_wchar_table[encoding].maxmblen : ++ pg_wchar_table[PG_SQL_ASCII].maxmblen; + } + + #ifndef FRONTEND +diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h +index ec101a834ef..07b316fae1d 100644 +--- a/src/include/mb/pg_wchar.h ++++ b/src/include/mb/pg_wchar.h +@@ -520,6 +520,7 @@ extern int pg_valid_server_encoding_id(int encoding); + * Remaining functions are not considered part of libpq's API, though many + * of them do exist inside libpq. + */ ++extern void pg_encoding_set_invalid(int encoding, char *dst); + extern int pg_mb2wchar(const char *from, pg_wchar *to); + extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); + extern int pg_encoding_mb2wchar_with_len(int encoding, +-- +2.39.5 (Apple Git-154) + + +From 581adbfe8c9db2e641705b308a74e5b6d89c61a6 Mon Sep 17 00:00:00 2001 +From: Andres Freund +Date: Mon, 10 Feb 2025 10:03:37 -0500 +Subject: [PATCH 2/8] Backport upstream + commit:3e98c8ce50e46d58b91bf3ea806e995296dc5b91 Specify the encoding of input + to fmtId() + +This commit adds fmtIdEnc() and fmtQualifiedIdEnc(), which allow to specify +the encoding as an explicit argument. Additionally setFmtEncoding() is +provided, which defines the encoding when no explicit encoding is provided, to +avoid breaking all code using fmtId(). + +All users of fmtId()/fmtQualifiedId() are either converted to the explicit +version or a call to setFmtEncoding() has been added. 
+ +This commit does not yet utilize the now well-defined encoding, that will +happen in a subsequent commit. +--- + src/bin/pg_dump/pg_backup_archiver.c | 1 + + src/bin/pg_dump/pg_dump.c | 1 + + src/bin/pg_dump/pg_dumpall.c | 1 + + src/bin/psql/command.c | 3 + + src/bin/scripts/common.c | 5 +- + src/bin/scripts/createdb.c | 2 + + src/bin/scripts/createuser.c | 2 + + src/bin/scripts/dropdb.c | 8 ++- + src/bin/scripts/dropuser.c | 3 +- + src/bin/scripts/reindexdb.c | 4 +- + src/bin/scripts/vacuumdb.c | 5 +- + src/fe_utils/string_utils.c | 84 ++++++++++++++++++++++++++-- + src/include/fe_utils/string_utils.h | 5 +- + 13 files changed, 109 insertions(+), 15 deletions(-) + +diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c +index 6476f7119af..489a84aca3d 100644 +--- a/src/bin/pg_dump/pg_backup_archiver.c ++++ b/src/bin/pg_dump/pg_backup_archiver.c +@@ -2731,6 +2731,7 @@ processEncodingEntry(ArchiveHandle *AH, TocEntry *te) + fatal("unrecognized encoding \"%s\"", + ptr1); + AH->public.encoding = encoding; ++ setFmtEncoding(encoding); + } + else + fatal("invalid ENCODING item: %s", +diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c +index 67a3714c62c..53fc95f3033 100644 +--- a/src/bin/pg_dump/pg_dump.c ++++ b/src/bin/pg_dump/pg_dump.c +@@ -1085,6 +1085,7 @@ setup_connection(Archive *AH, const char *dumpencoding, + * we know how to escape strings. + */ + AH->encoding = PQclientEncoding(conn); ++ setFmtEncoding(AH->encoding); + + std_strings = PQparameterStatus(conn, "standard_conforming_strings"); + AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0); +diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c +index 27093220ab9..a44cd765c45 100644 +--- a/src/bin/pg_dump/pg_dumpall.c ++++ b/src/bin/pg_dump/pg_dumpall.c +@@ -508,6 +508,7 @@ main(int argc, char *argv[]) + * we know how to escape strings. 
+ */ + encoding = PQclientEncoding(conn); ++ setFmtEncoding(encoding); + std_strings = PQparameterStatus(conn, "standard_conforming_strings"); + if (!std_strings) + std_strings = "off"; +diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c +index 8889f833714..66e7fb4cf31 100644 +--- a/src/bin/psql/command.c ++++ b/src/bin/psql/command.c +@@ -1183,6 +1183,7 @@ exec_command_encoding(PsqlScanState scan_state, bool active_branch) + /* save encoding info into psql internal data */ + pset.encoding = PQclientEncoding(pset.db); + pset.popt.topt.encoding = pset.encoding; ++ setFmtEncoding(pset.encoding); + SetVariable(pset.vars, "ENCODING", + pg_encoding_to_char(pset.encoding)); + } +@@ -3467,6 +3468,8 @@ SyncVariables(void) + pset.popt.topt.encoding = pset.encoding; + pset.sversion = PQserverVersion(pset.db); + ++ setFmtEncoding(pset.encoding); ++ + SetVariable(pset.vars, "DBNAME", PQdb(pset.db)); + SetVariable(pset.vars, "USER", PQuser(pset.db)); + SetVariable(pset.vars, "HOST", PQhost(pset.db)); +diff --git a/src/bin/scripts/common.c b/src/bin/scripts/common.c +index 2de696c19ef..ff79421a31d 100644 +--- a/src/bin/scripts/common.c ++++ b/src/bin/scripts/common.c +@@ -376,8 +376,9 @@ appendQualifiedRelation(PQExpBuffer buf, const char *spec, + exit(1); + } + appendPQExpBufferStr(buf, +- fmtQualifiedId(PQgetvalue(res, 0, 1), +- PQgetvalue(res, 0, 0))); ++ fmtQualifiedIdEnc(PQgetvalue(res, 0, 1), ++ PQgetvalue(res, 0, 0), ++ PQclientEncoding(conn))); + appendPQExpBufferStr(buf, columns); + PQclear(res); + termPQExpBuffer(&sql); +diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c +index b4d3e134d93..d9f55cc9f5d 100644 +--- a/src/bin/scripts/createdb.c ++++ b/src/bin/scripts/createdb.c +@@ -190,6 +190,8 @@ main(int argc, char *argv[]) + + conn = connectMaintenanceDatabase(&cparams, progname, echo); + ++ setFmtEncoding(PQclientEncoding(conn)); ++ + initPQExpBuffer(&sql); + + appendPQExpBuffer(&sql, "CREATE DATABASE %s", +diff --git 
a/src/bin/scripts/createuser.c b/src/bin/scripts/createuser.c +index dbc2c2a58cd..7ec8ee51be7 100644 +--- a/src/bin/scripts/createuser.c ++++ b/src/bin/scripts/createuser.c +@@ -271,6 +271,8 @@ main(int argc, char *argv[]) + + conn = connectMaintenanceDatabase(&cparams, progname, echo); + ++ setFmtEncoding(PQclientEncoding(conn)); ++ + initPQExpBuffer(&sql); + + printfPQExpBuffer(&sql, "CREATE ROLE %s", fmtId(newuser)); +diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c +index ffdf12bfea7..0d636d0ef46 100644 +--- a/src/bin/scripts/dropdb.c ++++ b/src/bin/scripts/dropdb.c +@@ -125,7 +125,7 @@ main(int argc, char *argv[]) + initPQExpBuffer(&sql); + + appendPQExpBuffer(&sql, "DROP DATABASE %s%s;", +- (if_exists ? "IF EXISTS " : ""), fmtId(dbname)); ++ (if_exists ? "IF EXISTS " : ""), fmtIdEnc(dbname, PQclientEncoding(conn))); + + /* Avoid trying to drop postgres db while we are connected to it. */ + if (maintenance_db == NULL && strcmp(dbname, "postgres") == 0) +@@ -140,6 +140,12 @@ main(int argc, char *argv[]) + + conn = connectMaintenanceDatabase(&cparams, progname, echo); + ++ initPQExpBuffer(&sql); ++ appendPQExpBuffer(&sql, "DROP DATABASE %s%s%s;", ++ (if_exists ? "IF EXISTS " : ""), ++ fmtIdEnc(dbname, PQclientEncoding(conn)), ++ force ? " WITH (FORCE)" : ""); ++ + if (echo) + printf("%s\n", sql.data); + result = PQexec(conn, sql.data); +diff --git a/src/bin/scripts/dropuser.c b/src/bin/scripts/dropuser.c +index a8be6b0784b..26523f85784 100644 +--- a/src/bin/scripts/dropuser.c ++++ b/src/bin/scripts/dropuser.c +@@ -143,7 +143,8 @@ main(int argc, char *argv[]) + + initPQExpBuffer(&sql); + appendPQExpBuffer(&sql, "DROP ROLE %s%s;", +- (if_exists ? "IF EXISTS " : ""), fmtId(dropuser)); ++ (if_exists ? 
"IF EXISTS " : ""), ++ fmtIdEnc(dropuser, PQclientEncoding(conn))); + + if (echo) + printf("%s\n", sql.data); +diff --git a/src/bin/scripts/reindexdb.c b/src/bin/scripts/reindexdb.c +index 39b4078b411..b96d0ff54cf 100644 +--- a/src/bin/scripts/reindexdb.c ++++ b/src/bin/scripts/reindexdb.c +@@ -325,7 +325,7 @@ reindex_one_database(const ConnParams *cparams, + else if (strcmp(type, "SCHEMA") == 0) + appendPQExpBufferStr(&sql, name); + else if (strcmp(type, "DATABASE") == 0) +- appendPQExpBufferStr(&sql, fmtId(PQdb(conn))); ++ appendPQExpBufferStr(&sql, fmtIdEnc(PQdb(conn),PQclientEncoding(conn))); + appendPQExpBufferChar(&sql, ';'); + + if (!executeMaintenanceCommand(conn, sql.data, echo)) +@@ -403,7 +403,7 @@ reindex_system_catalogs(const ConnParams *cparams, + appendPQExpBufferStr(&sql, " SYSTEM "); + if (concurrently) + appendPQExpBuffer(&sql, "CONCURRENTLY "); +- appendPQExpBufferStr(&sql, fmtId(PQdb(conn))); ++ appendPQExpBufferStr(&sql, fmtIdEnc(PQdb(conn),PQclientEncoding(conn))); + appendPQExpBufferChar(&sql, ';'); + + if (!executeMaintenanceCommand(conn, sql.data, echo)) +diff --git a/src/bin/scripts/vacuumdb.c b/src/bin/scripts/vacuumdb.c +index 6ade0c31a9d..8f9ce6529dc 100644 +--- a/src/bin/scripts/vacuumdb.c ++++ b/src/bin/scripts/vacuumdb.c +@@ -602,8 +602,9 @@ vacuum_one_database(const ConnParams *cparams, + for (i = 0; i < ntups; i++) + { + appendPQExpBufferStr(&buf, +- fmtQualifiedId(PQgetvalue(res, i, 1), +- PQgetvalue(res, i, 0))); ++ fmtQualifiedIdEnc(PQgetvalue(res, i, 1), ++ PQgetvalue(res, i, 0), ++ PQclientEncoding(conn))); + + if (tables_listed && !PQgetisnull(res, i, 2)) + appendPQExpBufferStr(&buf, PQgetvalue(res, i, 2)); +diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c +index d5757becef2..05f0bd2576d 100644 +--- a/src/fe_utils/string_utils.c ++++ b/src/fe_utils/string_utils.c +@@ -18,6 +18,7 @@ + #include + + #include "fe_utils/string_utils.h" ++#include "mb/pg_wchar.h" + + #include "common/keywords.h" + +@@ -28,6 
+29,8 @@ static PQExpBuffer defaultGetLocalPQExpBuffer(void); + int quote_all_identifiers = 0; + PQExpBuffer (*getLocalPQExpBuffer) (void) = defaultGetLocalPQExpBuffer; + ++static int fmtIdEncoding = -1; ++ + + /* + * Returns a temporary PQExpBuffer, valid until the next call to the function. +@@ -56,14 +59,48 @@ defaultGetLocalPQExpBuffer(void) + return id_return; + } + ++/* ++ * Set the encoding that fmtId() and fmtQualifiedId() use. ++ * ++ * This is not safe against multiple connections having different encodings, ++ * but there is no real other way to address the need to know the encoding for ++ * fmtId()/fmtQualifiedId() input for safe escaping. Eventually we should get ++ * rid of fmtId(). ++ */ ++void ++setFmtEncoding(int encoding) ++{ ++ fmtIdEncoding = encoding; ++} ++ ++/* ++ * Return the currently configured encoding for fmtId() and fmtQualifiedId(). ++ */ ++static int ++getFmtEncoding(void) ++{ ++ if (fmtIdEncoding != -1) ++ return fmtIdEncoding; ++ ++ /* ++ * In assertion builds it seems best to fail hard if the encoding was not ++ * set, to make it easier to find places with missing calls. But in ++ * production builds that seems like a bad idea, thus we instead just ++ * default to UTF-8. ++ */ ++ Assert(fmtIdEncoding != -1); ++ ++ return PG_UTF8; ++} ++ + /* + * Quotes input string if it's not a legitimate SQL identifier as-is. + * +- * Note that the returned string must be used before calling fmtId again, ++ * Note that the returned string must be used before calling fmtIdEnc again, + * since we re-use the same return buffer each time. + */ + const char * +-fmtId(const char *rawid) ++fmtIdEnc(const char *rawid, int encoding) + { + PQExpBuffer id_return = getLocalPQExpBuffer(); + +@@ -136,7 +173,24 @@ fmtId(const char *rawid) + } + + /* +- * fmtQualifiedId - construct a schema-qualified name, with quoting as needed. ++ * Quotes input string if it's not a legitimate SQL identifier as-is. 
++ * ++ * Note that the returned string must be used before calling fmtId again, ++ * since we re-use the same return buffer each time. ++ * ++ * NB: This assumes setFmtEncoding() previously has been called to configure ++ * the encoding of rawid. It is preferable to use fmtIdEnc() with an ++ * explicit encoding. ++ */ ++const char * ++fmtId(const char *rawid) ++{ ++ return fmtIdEnc(rawid, getFmtEncoding()); ++} ++ ++/* ++ * fmtQualifiedIdEnc - construct a schema-qualified name, with quoting as ++ * needed. + * + * Like fmtId, use the result before calling again. + * +@@ -144,7 +198,7 @@ fmtId(const char *rawid) + * use that buffer until we're finished with calling fmtId(). + */ + const char * +-fmtQualifiedId(const char *schema, const char *id) ++fmtQualifiedIdEnc(const char *schema, const char *id, int encoding) + { + PQExpBuffer id_return; + PQExpBuffer lcl_pqexp = createPQExpBuffer(); +@@ -152,9 +206,9 @@ fmtQualifiedId(const char *schema, const char *id) + /* Some callers might fail to provide a schema name */ + if (schema && *schema) + { +- appendPQExpBuffer(lcl_pqexp, "%s.", fmtId(schema)); ++ appendPQExpBuffer(lcl_pqexp, "%s.", fmtIdEnc(schema, encoding)); + } +- appendPQExpBufferStr(lcl_pqexp, fmtId(id)); ++ appendPQExpBufferStr(lcl_pqexp, fmtIdEnc(id, encoding)); + + id_return = getLocalPQExpBuffer(); + +@@ -164,6 +218,24 @@ fmtQualifiedId(const char *schema, const char *id) + return id_return->data; + } + ++/* ++ * fmtQualifiedId - construct a schema-qualified name, with quoting as needed. ++ * ++ * Like fmtId, use the result before calling again. ++ * ++ * Since we call fmtId and it also uses getLocalPQExpBuffer() we cannot ++ * use that buffer until we're finished with calling fmtId(). ++ * ++ * NB: This assumes setFmtEncoding() previously has been called to configure ++ * the encoding of schema/id. It is preferable to use fmtQualifiedIdEnc() ++ * with an explicit encoding. 
++ */ ++const char * ++fmtQualifiedId(const char *schema, const char *id) ++{ ++ return fmtQualifiedIdEnc(schema, id, getFmtEncoding()); ++} ++ + + /* + * Format a Postgres version number (in the PG_VERSION_NUM integer format +diff --git a/src/include/fe_utils/string_utils.h b/src/include/fe_utils/string_utils.h +index 8c13cc0a66d..37f17f0b370 100644 +--- a/src/include/fe_utils/string_utils.h ++++ b/src/include/fe_utils/string_utils.h +@@ -24,8 +24,11 @@ extern int quote_all_identifiers; + extern PQExpBuffer (*getLocalPQExpBuffer) (void); + + /* Functions */ +-extern const char *fmtId(const char *identifier); ++extern const char *fmtId(const char *rawid); ++extern const char *fmtIdEnc(const char *rawid, int encoding); + extern const char *fmtQualifiedId(const char *schema, const char *id); ++extern const char *fmtQualifiedIdEnc(const char *schema, const char *id, int encoding); ++extern void setFmtEncoding(int encoding); + + extern char *formatPGVersionNumber(int version_number, bool include_minor, + char *buf, size_t buflen); +-- +2.39.5 (Apple Git-154) + + +From 7c56df18c1f6e48c4343f2d6d1364c5825e45278 Mon Sep 17 00:00:00 2001 +From: Andres Freund +Date: Mon, 10 Feb 2025 10:03:37 -0500 +Subject: [PATCH 3/8] Backport upstream commit: + 5dc1e42b4fa6a4434afa7d7cdcf0291351a7b873 Fix handling of invalidly encoded + data in escaping functions + +Previously invalidly encoded input to various escaping functions could lead to +the escaped string getting incorrectly parsed by psql. To be safe, escaping +functions need to ensure that neither invalid nor incomplete multi-byte +characters can be used to "escape" from being quoted. + +Functions which can report errors now return an error in more cases than +before. Functions that cannot report errors now replace invalid input bytes +with a byte sequence that cannot be used to escape the quotes and that is +guaranteed to error out when a query is sent to the server. 
+ +The following functions are fixed by this commit: +- PQescapeLiteral() +- PQescapeIdentifier() +- PQescapeString() +- PQescapeStringConn() +- fmtId() +- appendStringLiteral() +--- + src/fe_utils/string_utils.c | 170 ++++++++++++++++++++++++++------- + src/interfaces/libpq/fe-exec.c | 114 ++++++++++++++-------- + 2 files changed, 212 insertions(+), 72 deletions(-) + +diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c +index 05f0bd2576d..9f7151bd542 100644 +--- a/src/fe_utils/string_utils.c ++++ b/src/fe_utils/string_utils.c +@@ -106,6 +106,7 @@ fmtIdEnc(const char *rawid, int encoding) + + const char *cp; + bool need_quotes = false; ++ size_t remaining = strlen(rawid); + + /* + * These checks need to match the identifier production in scan.l. Don't +@@ -119,7 +120,8 @@ fmtIdEnc(const char *rawid, int encoding) + else + { + /* otherwise check the entire string */ +- for (cp = rawid; *cp; cp++) ++ cp = rawid; ++ for (size_t i = 0; i < remaining; i++, cp++) + { + if (!((*cp >= 'a' && *cp <= 'z') + || (*cp >= '0' && *cp <= '9') +@@ -155,17 +157,90 @@ fmtIdEnc(const char *rawid, int encoding) + else + { + appendPQExpBufferChar(id_return, '"'); +- for (cp = rawid; *cp; cp++) ++ ++ cp = &rawid[0]; ++ while (remaining > 0) + { +- /* +- * Did we find a double-quote in the string? Then make this a +- * double double-quote per SQL99. Before, we put in a +- * backslash/double-quote pair. - thomas 2000-08-05 +- */ +- if (*cp == '"') +- appendPQExpBufferChar(id_return, '"'); +- appendPQExpBufferChar(id_return, *cp); ++ int charlen; ++ ++ /* Fast path for plain ASCII */ ++ if (!IS_HIGHBIT_SET(*cp)) ++ { ++ /* ++ * Did we find a double-quote in the string? Then make this a ++ * double double-quote per SQL99. Before, we put in a ++ * backslash/double-quote pair. 
- thomas 2000-08-05 ++ */ ++ if (*cp == '"') ++ appendPQExpBufferChar(id_return, '"'); ++ appendPQExpBufferChar(id_return, *cp); ++ remaining--; ++ cp++; ++ continue; ++ } ++ ++ /* Slow path for possible multibyte characters */ ++ charlen = pg_encoding_mblen(encoding, cp); ++ ++ if (remaining < charlen) ++ { ++ /* ++ * If the character is longer than the available input, ++ * replace the string with an invalid sequence. The invalid ++ * sequence ensures that the escaped string will trigger an ++ * error on the server-side, even if we can't directly report ++ * an error here. ++ */ ++ enlargePQExpBuffer(id_return, 2); ++ pg_encoding_set_invalid(encoding, ++ id_return->data + id_return->len); ++ id_return->len += 2; ++ id_return->data[id_return->len] = '\0'; ++ ++ /* there's no more input data, so we can stop */ ++ break; ++ } ++ else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1) ++ { ++ /* ++ * Multibyte character is invalid. It's important to verify ++ * that as invalid multi-byte characters could e.g. be used to ++ * "skip" over quote characters, e.g. when parsing ++ * character-by-character. ++ * ++ * Replace the bytes corresponding to the invalid character ++ * with an invalid sequence, for the same reason as above. ++ * ++ * It would be a bit faster to verify the whole string the ++ * first time we encounter a set highbit, but this way we can ++ * replace just the invalid characters, which probably makes ++ * it easier for users to find the invalidly encoded portion ++ * of a larger string. ++ */ ++ enlargePQExpBuffer(id_return, 2); ++ pg_encoding_set_invalid(encoding, ++ id_return->data + id_return->len); ++ id_return->len += 2; ++ id_return->data[id_return->len] = '\0'; ++ ++ /* ++ * Copy the rest of the string after the invalid multi-byte ++ * character. 
++ */ ++ remaining -= charlen; ++ cp += charlen; ++ } ++ else ++ { ++ for (int i = 0; i < charlen; i++) ++ { ++ appendPQExpBufferChar(id_return, *cp); ++ remaining--; ++ cp++; ++ } ++ } + } ++ + appendPQExpBufferChar(id_return, '"'); + } + +@@ -292,6 +367,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str, + size_t length = strlen(str); + const char *source = str; + char *target; ++ size_t remaining = length; + + if (!enlargePQExpBuffer(buf, 2 * length + 2)) + return; +@@ -299,10 +375,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str, + target = buf->data + buf->len; + *target++ = '\''; + +- while (*source != '\0') ++ while (remaining > 0) + { + char c = *source; +- int len; ++ int charlen; + int i; + + /* Fast path for plain ASCII */ +@@ -314,39 +390,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str, + /* Copy the character */ + *target++ = c; + source++; ++ remaining--; + continue; + } + + /* Slow path for possible multibyte characters */ +- len = PQmblen(source, encoding); ++ charlen = PQmblen(source, encoding); + +- /* Copy the character */ +- for (i = 0; i < len; i++) ++ if (remaining < charlen) + { +- if (*source == '\0') +- break; +- *target++ = *source++; +- } ++ /* ++ * If the character is longer than the available input, replace ++ * the string with an invalid sequence. The invalid sequence ++ * ensures that the escaped string will trigger an error on the ++ * server-side, even if we can't directly report an error here. ++ * ++ * We know there's enough space for the invalid sequence because ++ * the "target" buffer is 2 * length + 2 long, and at worst we're ++ * replacing a single input byte with two invalid bytes. ++ */ ++ pg_encoding_set_invalid(encoding, target); ++ target += 2; + +- /* +- * If we hit premature end of string (ie, incomplete multibyte +- * character), try to pad out to the correct length with spaces. 
We +- * may not be able to pad completely, but we will always be able to +- * insert at least one pad space (since we'd not have quoted a +- * multibyte character). This should be enough to make a string that +- * the server will error out on. +- */ +- if (i < len) ++ /* there's no more valid input data, so we can stop */ ++ break; ++ } ++ else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1) + { +- char *stop = buf->data + buf->maxlen - 2; ++ /* ++ * Multibyte character is invalid. It's important to verify that ++ * as invalid multi-byte characters could e.g. be used to "skip" ++ * over quote characters, e.g. when parsing ++ * character-by-character. ++ * ++ * Replace the bytes corresponding to the invalid character with ++ * an invalid sequence, for the same reason as above. ++ * ++ * It would be a bit faster to verify the whole string the first ++ * time we encounter a set highbit, but this way we can replace ++ * just the invalid characters, which probably makes it easier for ++ * users to find the invalidly encoded portion of a larger string. ++ */ ++ pg_encoding_set_invalid(encoding, target); ++ target += 2; ++ remaining -= charlen; + +- for (; i < len; i++) ++ /* ++ * Copy the rest of the string after the invalid multi-byte ++ * character. 
++ */ ++ source += charlen; ++ } ++ else ++ { ++ /* Copy the character */ ++ for (i = 0; i < charlen; i++) + { +- if (target >= stop) +- break; +- *target++ = ' '; ++ *target++ = *source++; ++ remaining--; + } +- break; + } + } + +diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c +index ff101c4ca2a..97cd2c53673 100644 +--- a/src/interfaces/libpq/fe-exec.c ++++ b/src/interfaces/libpq/fe-exec.c +@@ -3348,15 +3348,15 @@ PQescapeStringInternal(PGconn *conn, + { + const char *source = from; + char *target = to; +- size_t remaining = length; ++ size_t remaining = strnlen(from, length); + + if (error) + *error = 0; + +- while (remaining > 0 && *source != '\0') ++ while (remaining > 0) + { + char c = *source; +- int len; ++ int charlen; + int i; + + /* Fast path for plain ASCII */ +@@ -3373,39 +3373,48 @@ PQescapeStringInternal(PGconn *conn, + } + + /* Slow path for possible multibyte characters */ +- len = pg_encoding_mblen(encoding, source); ++ charlen = pg_encoding_mblen(encoding, source); + +- /* Copy the character */ +- for (i = 0; i < len; i++) +- { +- if (remaining == 0 || *source == '\0') +- break; +- *target++ = *source++; +- remaining--; +- } +- +- /* +- * If we hit premature end of string (ie, incomplete multibyte +- * character), try to pad out to the correct length with spaces. We +- * may not be able to pad completely, but we will always be able to +- * insert at least one pad space (since we'd not have quoted a +- * multibyte character). This should be enough to make a string that +- * the server will error out on. +- */ +- if (i < len) ++ if (remaining < charlen || ++ pg_encoding_verifymbchar(encoding, source, charlen) == -1) + { ++ /* ++ * If the character is longer than the available input, report an ++ * error if possible, and replace the string with an invalid ++ * sequence. The invalid sequence ensures that the escaped string ++ * will trigger an error on the server-side, even if we can't ++ * directly report an error here. 
++ * ++ * This isn't *that* crucial when we can report an error to the ++ * caller, but if we can't, the caller will use this string ++ * unmodified and it needs to be safe for parsing. ++ * ++ * We know there's enough space for the invalid sequence because ++ * the "to" buffer needs to be at least 2 * length + 1 long, and ++ * at worst we're replacing a single input byte with two invalid ++ * bytes. ++ */ + if (error) + *error = 1; + if (conn) +- printfPQExpBuffer(&conn->errorMessage, +- libpq_gettext("incomplete multibyte character\n")); +- for (; i < len; i++) ++ libpq_append_conn_error(conn, "incomplete multibyte character"); ++ ++ pg_encoding_set_invalid(encoding, target); ++ target += 2; ++ source++; ++ remaining--; ++ ++ /* there's no more input data, so we can stop */ ++ break; ++ } ++ else ++ { ++ /* Copy the character */ ++ for (i = 0; i < charlen; i++) + { +- if (((size_t) (target - to)) / 2 >= length) +- break; +- *target++ = ' '; ++ *target++ = *source++; ++ remaining--; + } +- break; + } + } + +@@ -3451,21 +3460,27 @@ PQescapeString(char *to, const char *from, size_t length) + static char * + PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) + { +- const char *s; ++ const char *s; + char *result; + char *rp; + int num_quotes = 0; /* single or double, depending on as_ident */ + int num_backslashes = 0; +- int input_len; +- int result_size; ++ size_t input_len = strnlen(str, len); ++ size_t result_size; + char quote_char = as_ident ? '"' : '\''; ++ bool validated_mb = false; + + /* We must have a connection, else fail immediately. */ + if (!conn) + return NULL; + +- /* Scan the string for characters that must be escaped. */ +- for (s = str; (s - str) < len && *s != '\0'; ++s) ++ ++ /* ++ * Scan the string for characters that must be escaped and for invalidly ++ * encoded data. 
++ */ ++ s = str; ++ for (size_t remaining = input_len; remaining > 0; remaining--, s++) + { + if (*s == quote_char) + ++num_quotes; +@@ -3478,21 +3493,42 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) + /* Slow path for possible multibyte characters */ + charlen = pg_encoding_mblen(conn->client_encoding, s); + +- /* Multibyte character overruns allowable length. */ +- if ((s - str) + charlen > len || memchr(s, 0, charlen) != NULL) ++ if (charlen > remaining) + { + printfPQExpBuffer(&conn->errorMessage, + libpq_gettext("incomplete multibyte character\n")); + return NULL; + } + ++ /* ++ * If we haven't already, check that multibyte characters are ++ * valid. It's important to verify that as invalid multi-byte ++ * characters could e.g. be used to "skip" over quote characters, ++ * e.g. when parsing character-by-character. ++ * ++ * We check validity once, for the whole remainder of the string, ++ * when we first encounter any multi-byte character. Some ++ * encodings have optimized implementations for longer strings. ++ */ ++ if (!validated_mb) ++ { ++ if (pg_encoding_verifymbstr(conn->client_encoding, s, remaining) ++ != remaining) ++ { ++ printfPQExpBuffer(&conn->errorMessage, ++ libpq_gettext("invalid multibyte character\n")); ++ return NULL; ++ } ++ validated_mb = true; ++ } ++ + /* Adjust s, bearing in mind that for loop will increment it. */ + s += charlen - 1; ++ remaining -= charlen - 1; + } + } + + /* Allocate output buffer. 
*/ +- input_len = s - str; + result_size = input_len + num_quotes + 3; /* two quotes, plus a NUL */ + if (!as_ident && num_backslashes > 0) + result_size += num_backslashes + 2; +@@ -3538,7 +3574,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) + } + else + { +- for (s = str; s - str < input_len; ++s) ++ s = str; ++ for (size_t remaining = input_len; remaining > 0; remaining--, s++) + { + if (*s == quote_char || (!as_ident && *s == '\\')) + { +@@ -3556,6 +3593,7 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) + *rp++ = *s; + if (--i == 0) + break; ++ remaining--; + ++s; /* for loop will provide the final increment */ + } + } +-- +2.39.5 (Apple Git-154) + + +From 3751ccde18122412fcbfcc2df583cf66fefdbab0 Mon Sep 17 00:00:00 2001 +From: Tom Lane +Date: Mon, 10 Feb 2025 16:30:03 -0500 +Subject: [PATCH 4/8] Backport upstream commit + 5bf12323b6b8b05790aab6876555568898f4fc81 Adapt appendPsqlMetaConnect() to the + new fmtId() encoding expectations. + +We need to tell fmtId() what encoding to assume, but this function +doesn't know that. Fortunately we can fix that without changing the +function's API, because we can just use SQL_ASCII. That's because +database names in connection requests are effectively binary not text: +no encoding-aware processing will happen on them. + +This fixes XversionUpgrade failures seen in the buildfarm. The +alternative of having pg_upgrade use setFmtEncoding() is unappetizing, +given that it's connecting to multiple databases that may have +different encodings. 
+ +Andres Freund, Noah Misch, Tom Lane + +Security: CVE-2025-1094 +--- + src/fe_utils/string_utils.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c +index 9f7151bd542..a289d3001eb 100644 +--- a/src/fe_utils/string_utils.c ++++ b/src/fe_utils/string_utils.c +@@ -792,16 +792,22 @@ appendPsqlMetaConnect(PQExpBuffer buf, const char *dbname) + } + } + +- appendPQExpBufferStr(buf, "\\connect "); + if (complex) + { + PQExpBufferData connstr; + + initPQExpBuffer(&connstr); +- appendPQExpBuffer(&connstr, "dbname="); +- appendConnStrVal(&connstr, dbname); + +- appendPQExpBuffer(buf, "-reuse-previous=on "); ++ /* ++ * Force the target psql's encoding to SQL_ASCII. We don't really ++ * know the encoding of the database name, and it doesn't matter as ++ * long as psql will forward it to the server unchanged. ++ */ ++ appendPQExpBufferStr(buf, "\\encoding SQL_ASCII\n"); ++ appendPQExpBufferStr(buf, "\\connect -reuse-previous=on "); ++ ++ appendPQExpBufferStr(&connstr, "dbname="); ++ appendConnStrVal(&connstr, dbname); + + /* + * As long as the name does not contain a newline, SQL identifier +@@ -809,12 +815,15 @@ appendPsqlMetaConnect(PQExpBuffer buf, const char *dbname) + * involve psql-interpreted single quotes, which behaved differently + * before PostgreSQL 9.2. 
+ */ +- appendPQExpBufferStr(buf, fmtId(connstr.data)); ++ appendPQExpBufferStr(buf, fmtIdEnc(connstr.data, PG_SQL_ASCII)); + + termPQExpBuffer(&connstr); + } + else +- appendPQExpBufferStr(buf, fmtId(dbname)); ++ { ++ appendPQExpBufferStr(buf, "\\connect "); ++ appendPQExpBufferStr(buf, fmtIdEnc(dbname, PG_SQL_ASCII)); ++ } + appendPQExpBufferChar(buf, '\n'); + } + +-- +2.39.5 (Apple Git-154) + + +From 84b7b93568fa4523afb66d2d1776f5e24b5db1de Mon Sep 17 00:00:00 2001 +From: Tom Lane +Date: Sat, 15 Feb 2025 16:20:21 -0500 +Subject: [PATCH 5/8] Backport upstream commit: + 9f45e6a91d8460ac0b1f30e6ae3eefb185b8d0ab Make escaping functions retain + trailing bytes of an invalid character. + +Instead of dropping the trailing byte(s) of an invalid or incomplete +multibyte character, replace only the first byte with a known-invalid +sequence, and process the rest normally. This seems less likely to +confuse incautious callers than the behavior adopted in 5dc1e42b4. + +While we're at it, adjust PQescapeStringInternal to produce at most +one bleat about invalid multibyte characters per string. This +matches the behavior of PQescapeInternal, and avoids the risk of +producing tons of repetitive junk if a long string is simply given +in the wrong encoding. + +This is a followup to the fixes for CVE-2025-1094, and should be +included if cherry-picking those fixes. 
+ +Author: Andres Freund +Co-authored-by: Tom Lane +Reported-by: Jeff Davis +Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com +--- + src/fe_utils/string_utils.c | 91 +++++++++++++--------------------- + src/interfaces/libpq/fe-exec.c | 22 ++++---- + 2 files changed, 47 insertions(+), 66 deletions(-) + +diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c +index a289d3001eb..a2d5ccd1e28 100644 +--- a/src/fe_utils/string_utils.c ++++ b/src/fe_utils/string_utils.c +@@ -182,40 +182,25 @@ fmtIdEnc(const char *rawid, int encoding) + /* Slow path for possible multibyte characters */ + charlen = pg_encoding_mblen(encoding, cp); + +- if (remaining < charlen) +- { +- /* +- * If the character is longer than the available input, +- * replace the string with an invalid sequence. The invalid +- * sequence ensures that the escaped string will trigger an +- * error on the server-side, even if we can't directly report +- * an error here. +- */ +- enlargePQExpBuffer(id_return, 2); +- pg_encoding_set_invalid(encoding, +- id_return->data + id_return->len); +- id_return->len += 2; +- id_return->data[id_return->len] = '\0'; +- +- /* there's no more input data, so we can stop */ +- break; +- } +- else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1) ++ if (remaining < charlen || ++ pg_encoding_verifymbchar(encoding, cp, charlen) == -1) + { + /* + * Multibyte character is invalid. It's important to verify +- * that as invalid multi-byte characters could e.g. be used to ++ * that as invalid multibyte characters could e.g. be used to + * "skip" over quote characters, e.g. when parsing + * character-by-character. + * +- * Replace the bytes corresponding to the invalid character +- * with an invalid sequence, for the same reason as above. ++ * Replace the character's first byte with an invalid ++ * sequence. 
The invalid sequence ensures that the escaped ++ * string will trigger an error on the server-side, even if we ++ * can't directly report an error here. + * + * It would be a bit faster to verify the whole string the + * first time we encounter a set highbit, but this way we can +- * replace just the invalid characters, which probably makes +- * it easier for users to find the invalidly encoded portion +- * of a larger string. ++ * replace just the invalid data, which probably makes it ++ * easier for users to find the invalidly encoded portion of a ++ * larger string. + */ + enlargePQExpBuffer(id_return, 2); + pg_encoding_set_invalid(encoding, +@@ -224,11 +209,13 @@ fmtIdEnc(const char *rawid, int encoding) + id_return->data[id_return->len] = '\0'; + + /* +- * Copy the rest of the string after the invalid multi-byte +- * character. ++ * Handle the following bytes as if this byte didn't exist. ++ * That's safer in case the subsequent bytes contain ++ * characters that are significant for the caller (e.g. '>' in ++ * html). + */ +- remaining -= charlen; +- cp += charlen; ++ remaining--; ++ cp++; + } + else + { +@@ -397,49 +384,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str, + /* Slow path for possible multibyte characters */ + charlen = PQmblen(source, encoding); + +- if (remaining < charlen) +- { +- /* +- * If the character is longer than the available input, replace +- * the string with an invalid sequence. The invalid sequence +- * ensures that the escaped string will trigger an error on the +- * server-side, even if we can't directly report an error here. +- * +- * We know there's enough space for the invalid sequence because +- * the "target" buffer is 2 * length + 2 long, and at worst we're +- * replacing a single input byte with two invalid bytes. 
+- */ +- pg_encoding_set_invalid(encoding, target); +- target += 2; +- +- /* there's no more valid input data, so we can stop */ +- break; +- } +- else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1) ++ if (remaining < charlen || ++ pg_encoding_verifymbchar(encoding, source, charlen) == -1) + { + /* + * Multibyte character is invalid. It's important to verify that +- * as invalid multi-byte characters could e.g. be used to "skip" ++ * as invalid multibyte characters could e.g. be used to "skip" + * over quote characters, e.g. when parsing + * character-by-character. + * +- * Replace the bytes corresponding to the invalid character with +- * an invalid sequence, for the same reason as above. ++ * Replace the character's first byte with an invalid sequence. ++ * The invalid sequence ensures that the escaped string will ++ * trigger an error on the server-side, even if we can't directly ++ * report an error here. ++ * ++ * We know there's enough space for the invalid sequence because ++ * the "target" buffer is 2 * length + 2 long, and at worst we're ++ * replacing a single input byte with two invalid bytes. + * + * It would be a bit faster to verify the whole string the first + * time we encounter a set highbit, but this way we can replace +- * just the invalid characters, which probably makes it easier for +- * users to find the invalidly encoded portion of a larger string. ++ * just the invalid data, which probably makes it easier for users ++ * to find the invalidly encoded portion of a larger string. + */ + pg_encoding_set_invalid(encoding, target); + target += 2; +- remaining -= charlen; + + /* +- * Copy the rest of the string after the invalid multi-byte +- * character. ++ * Handle the following bytes as if this byte didn't exist. That's ++ * safer in case the subsequent bytes contain important characters ++ * for the caller (e.g. '>' in html). 
+ */ +- source += charlen; ++ source++; ++ remaining--; + } + else + { +diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c +index 97cd2c53673..a29d19a6268 100644 +--- a/src/interfaces/libpq/fe-exec.c ++++ b/src/interfaces/libpq/fe-exec.c +@@ -3349,6 +3349,7 @@ PQescapeStringInternal(PGconn *conn, + const char *source = from; + char *target = to; + size_t remaining = strnlen(from, length); ++ bool already_complained = false; + + if (error) + *error = 0; +@@ -3379,15 +3380,20 @@ PQescapeStringInternal(PGconn *conn, + pg_encoding_verifymbchar(encoding, source, charlen) == -1) + { + /* +- * If the character is longer than the available input, report an +- * error if possible, and replace the string with an invalid +- * sequence. The invalid sequence ensures that the escaped string +- * will trigger an error on the server-side, even if we can't +- * directly report an error here. ++ * Multibyte character is invalid. It's important to verify that ++ * as invalid multibyte characters could e.g. be used to "skip" ++ * over quote characters, e.g. when parsing ++ * character-by-character. ++ * ++ * Report an error if possible, and replace the character's first ++ * byte with an invalid sequence. The invalid sequence ensures ++ * that the escaped string will trigger an error on the ++ * server-side, even if we can't directly report an error here. + * + * This isn't *that* crucial when we can report an error to the +- * caller, but if we can't, the caller will use this string +- * unmodified and it needs to be safe for parsing. ++ * caller; but if we can't or the caller ignores it, the caller ++ * will use this string unmodified and it needs to be safe for ++ * parsing. 
+ * + * We know there's enough space for the invalid sequence because + * the "to" buffer needs to be at least 2 * length + 1 long, and +@@ -3404,8 +3410,6 @@ PQescapeStringInternal(PGconn *conn, + source++; + remaining--; + +- /* there's no more input data, so we can stop */ +- break; + } + else + { +-- +2.39.5 (Apple Git-154) + + +From 21118244dad366d20e1d11549df03dd56e76dbaa Mon Sep 17 00:00:00 2001 +From: Tom Lane +Date: Sun, 16 Feb 2025 12:46:35 -0500 +Subject: [PATCH 6/8] In fmtIdEnc(), handle failure of enlargePQExpBuffer(). + +Coverity complained that we weren't doing that, and it's right. + +This fix just makes fmtIdEnc() honor the general convention that OOM +causes a PQExpBuffer to become marked "broken", without any immediate +error. In the pretty-unlikely case that we actually did hit OOM here, +the end result would be to return an empty string to the caller, +probably resulting in invalid SQL syntax in an issued command (if +nothing else went wrong, which is even more unlikely). It's tempting +to throw an "out of memory" error if the buffer becomes broken, but +there's not a lot of point in doing that only here and not in hundreds +of other PQExpBuffer-using places in pg_dump and similar callers. +The whole issue could do with some non-time-crunched redesign, perhaps. + +This is a followup to the fixes for CVE-2025-1094, and should be +included if cherry-picking those fixes. +--- + src/fe_utils/string_utils.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c +index a2d5ccd1e28..fe280df3c0f 100644 +--- a/src/fe_utils/string_utils.c ++++ b/src/fe_utils/string_utils.c +@@ -202,11 +202,13 @@ fmtIdEnc(const char *rawid, int encoding) + * easier for users to find the invalidly encoded portion of a + * larger string. 
+ */ +- enlargePQExpBuffer(id_return, 2); +- pg_encoding_set_invalid(encoding, +- id_return->data + id_return->len); +- id_return->len += 2; +- id_return->data[id_return->len] = '\0'; ++ if (enlargePQExpBuffer(id_return, 2)) ++ { ++ pg_encoding_set_invalid(encoding, ++ id_return->data + id_return->len); ++ id_return->len += 2; ++ id_return->data[id_return->len] = '\0'; ++ } + + /* + * Handle the following bytes as if this byte didn't exist. +-- +2.39.5 (Apple Git-154) + + +From 6f42371a3c3911299c081afe3478022c496b07a9 Mon Sep 17 00:00:00 2001 +From: Filip Janus +Date: Mon, 17 Mar 2025 18:14:05 +0100 +Subject: [PATCH 7/8] Backport multiple changes from postgresql13, especially + wchar.c functionality from backend was moved to common directory, it means + that functionality can be used by server but also by libpq. Due to the + necessary changes there are a couple of "reverts" from previous commits in + src/backend/utils/mb/wchar.c but it's expected because now it's linked with + implementation from common/wchar.c instead of src/backend/utils/mb/wchar.c + +--- + src/backend/utils/mb/wchar.c | 101 +- + src/bin/scripts/dropdb.c | 5 +- + src/common/Makefile | 2 +- + src/common/wchar.c | 1728 ++++++++++++++++++ + src/include/common/unicode_combining_table.h | 196 ++ + src/include/mb/pg_wchar.h | 4 + + src/interfaces/libpq/fe-exec.c | 7 +- + 7 files changed, 1958 insertions(+), 85 deletions(-) + create mode 100644 src/common/wchar.c + create mode 100644 src/include/common/unicode_combining_table.h + +diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c +index 872241cc804..1ca6094d2a3 100644 +--- a/src/backend/utils/mb/wchar.c ++++ b/src/backend/utils/mb/wchar.c +@@ -14,25 +14,6 @@ + #include "mb/pg_wchar.h" + + +-/* +- * In today's multibyte encodings other than UTF8, this two-byte sequence +- * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+- * +- * For historical reasons, several verifychar implementations opt to reject +- * this pair specifically. Byte pair range constraints, in encoding +- * originator documentation, always excluded this pair. No core conversion +- * could translate it. However, longstanding verifychar implementations +- * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate +- * pairs not valid per encoding originator documentation. To avoid tightening +- * core or non-core conversions in a security patch, we sought this one pair. +- * +- * PQescapeString() historically used spaces for BYTE1; many other values +- * could suffice for BYTE1. +- */ +-#define NONUTF8_INVALID_BYTE0 (0x8d) +-#define NONUTF8_INVALID_BYTE1 (' ') +- +- + /* + * Operations on multi-byte encodings are driven by a table of helper + * functions. +@@ -496,7 +477,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) + /* + * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of + * space allocated. +- */ ++ * + unsigned char * + unicode_to_utf8(pg_wchar c, unsigned char *utf8string) + { +@@ -525,7 +506,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string) + + return utf8string; + } +- ++*/ + /* + * Trivial conversion from pg_wchar to UTF-8. + * caller should allocate enough space for "to" +@@ -562,7 +543,7 @@ pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len) + * + * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps + * other places would need to be fixed to change this. +- */ ++ * + int + pg_utf_mblen(const unsigned char *s) + { +@@ -586,7 +567,7 @@ pg_utf_mblen(const unsigned char *s) + len = 1; + return len; + } +- ++*/ + /* + * This is an implementation of wcwidth() and wcswidth() as defined in + * "The Single UNIX Specification, Version 2, The Open Group, 1997" +@@ -765,7 +746,7 @@ ucs_wcwidth(pg_wchar ucs) + * This is a one-character version of pg_utf2wchar_with_len. 
+ * + * No error checks here, c must point to a long-enough string. +- */ ++ * + pg_wchar + utf8_to_unicode(const unsigned char *c) + { +@@ -784,10 +765,10 @@ utf8_to_unicode(const unsigned char *c) + ((c[2] & 0x3f) << 6) | + (c[3] & 0x3f)); + else +- /* that is an invalid code on purpose */ ++ // that is an invalid code on purpose + return 0xffffffff; + } +- ++*/ + static int + pg_utf_dsplen(const unsigned char *s) + { +@@ -917,7 +898,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len) + *to = 0; + return cnt; + } +- ++/* + int + pg_mule_mblen(const unsigned char *s) + { +@@ -932,9 +913,9 @@ pg_mule_mblen(const unsigned char *s) + else if (IS_LCPRV2(*s)) + len = 4; + else +- len = 1; /* assume ASCII */ ++ len = 1; + return len; +-} ++}*/ + + static int + pg_mule_dsplen(const unsigned char *s) +@@ -1413,11 +1394,6 @@ pg_big5_verifier(const unsigned char *s, int len) + if (len < l) + return -1; + +- if (l == 2 && +- s[0] == NONUTF8_INVALID_BYTE0 && +- s[1] == NONUTF8_INVALID_BYTE1) +- return -1; +- + while (--l > 0) + { + if (*++s == '\0') +@@ -1438,11 +1414,6 @@ pg_gbk_verifier(const unsigned char *s, int len) + if (len < l) + return -1; + +- if (l == 2 && +- s[0] == NONUTF8_INVALID_BYTE0 && +- s[1] == NONUTF8_INVALID_BYTE1) +- return -1; +- + while (--l > 0) + { + if (*++s == '\0') +@@ -1463,11 +1434,6 @@ pg_uhc_verifier(const unsigned char *s, int len) + if (len < l) + return -1; + +- if (l == 2 && +- s[0] == NONUTF8_INVALID_BYTE0 && +- s[1] == NONUTF8_INVALID_BYTE1) +- return -1; +- + while (--l > 0) + { + if (*++s == '\0') +@@ -1535,7 +1501,7 @@ pg_utf8_verifier(const unsigned char *s, int len) + * + * length is assumed to have been obtained by pg_utf_mblen(), and the + * caller must have checked that that many bytes are present in the buffer. 
+- */ ++ * + bool + pg_utf8_islegal(const unsigned char *source, int length) + { +@@ -1544,18 +1510,15 @@ pg_utf8_islegal(const unsigned char *source, int length) + switch (length) + { + default: +- /* reject lengths 5 and 6 for now */ + return false; + case 4: + a = source[3]; + if (a < 0x80 || a > 0xBF) + return false; +- /* FALL THRU */ + case 3: + a = source[2]; + if (a < 0x80 || a > 0xBF) + return false; +- /* FALL THRU */ + case 2: + a = source[1]; + switch (*source) +@@ -1581,7 +1544,6 @@ pg_utf8_islegal(const unsigned char *source, int length) + return false; + break; + } +- /* FALL THRU */ + case 1: + a = *source; + if (a >= 0x80 && a < 0xC2) +@@ -1592,7 +1554,7 @@ pg_utf8_islegal(const unsigned char *source, int length) + } + return true; + } +- ++*/ + #ifndef FRONTEND + + /* +@@ -1802,26 +1764,13 @@ pg_eucjp_increment(unsigned char *charptr, int length) + #endif /* !FRONTEND */ + + +-/* +- * Fills the provided buffer with two bytes such that: +- * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0 +- */ +-void +-pg_encoding_set_invalid(int encoding, char *dst) +-{ +- Assert(pg_encoding_max_length(encoding) > 1); +- +- dst[0] = (encoding == PG_UTF8 ? 
0xc0 : NONUTF8_INVALID_BYTE0); +- dst[1] = NONUTF8_INVALID_BYTE1; +-} +- + /* + *------------------------------------------------------------------- + * encoding info table + * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h) + *------------------------------------------------------------------- + */ +-const pg_wchar_tbl pg_wchar_table[] = { ++const pg_wchar_tbl pg_wchar_table1[] = { + {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ +@@ -1875,7 +1824,7 @@ pg_mic_mblen(const unsigned char *mbstr) + + /* + * Returns the byte length of a multibyte character. +- */ ++ * + int + pg_encoding_mblen(int encoding, const char *mbstr) + { +@@ -1883,10 +1832,10 @@ pg_encoding_mblen(int encoding, const char *mbstr) + pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) : + pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr)); + } +- ++*/ + /* + * Returns the display length of a multibyte character. +- */ ++ * + int + pg_encoding_dsplen(int encoding, const char *mbstr) + { +@@ -1894,12 +1843,12 @@ pg_encoding_dsplen(int encoding, const char *mbstr) + pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) : + pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr)); + } +- ++*/ + /* + * Verify the first multibyte character of the given string. + * Return its byte length if good, -1 if bad. (See comments above for + * full details of the mbverify API.) 
+- */ ++ * + int + pg_encoding_verifymb(int encoding, const char *mbstr, int len) + { +@@ -1907,24 +1856,18 @@ pg_encoding_verifymb(int encoding, const char *mbstr, int len) + pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) : + pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len)); + } +- ++*/ + /* + * fetch maximum length of a given encoding +- */ ++ * + int + pg_encoding_max_length(int encoding) + { + Assert(PG_VALID_ENCODING(encoding)); + +- /* +- * Check for the encoding despite the assert, due to some mingw versions +- * otherwise issuing bogus warnings. +- */ +- return PG_VALID_ENCODING(encoding) ? +- pg_wchar_table[encoding].maxmblen : +- pg_wchar_table[PG_SQL_ASCII].maxmblen; ++ return pg_wchar_table[encoding].maxmblen; + } +- ++*/ + #ifndef FRONTEND + + /* +diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c +index 0d636d0ef46..ed3a2c8c19a 100644 +--- a/src/bin/scripts/dropdb.c ++++ b/src/bin/scripts/dropdb.c +@@ -141,10 +141,9 @@ main(int argc, char *argv[]) + conn = connectMaintenanceDatabase(&cparams, progname, echo); + + initPQExpBuffer(&sql); +- appendPQExpBuffer(&sql, "DROP DATABASE %s%s%s;", ++ appendPQExpBuffer(&sql, "DROP DATABASE %s%s;", + (if_exists ? "IF EXISTS " : ""), +- fmtIdEnc(dbname, PQclientEncoding(conn)), +- force ? 
" WITH (FORCE)" : ""); ++ fmtIdEnc(dbname, PQclientEncoding(conn))); + + if (echo) + printf("%s\n", sql.data); +diff --git a/src/common/Makefile b/src/common/Makefile +index 2f22b9b101d..c26d938b31e 100644 +--- a/src/common/Makefile ++++ b/src/common/Makefile +@@ -50,7 +50,7 @@ OBJS_COMMON = base64.o config_info.o controldata_utils.o d2s.o exec.o f2s.o \ + file_perm.o ip.o keywords.o kwlookup.o link-canary.o md5.o \ + pg_lzcompress.o pgfnames.o psprintf.o relpath.o \ + rmtree.o saslprep.o scram-common.o string.o unicode_norm.o \ +- username.o wait_error.o ++ username.o wait_error.o wchar.o + + ifeq ($(with_openssl),yes) + OBJS_COMMON += sha2_openssl.o +diff --git a/src/common/wchar.c b/src/common/wchar.c +new file mode 100644 +index 00000000000..85822b2c3b5 +--- /dev/null ++++ b/src/common/wchar.c +@@ -0,0 +1,1728 @@ ++/*------------------------------------------------------------------------- ++ * ++ * wchar.c ++ * Functions for working with multibyte characters in various encodings. ++ * ++ * Portions Copyright (c) 1998-2020, PostgreSQL Global Development Group ++ * ++ * IDENTIFICATION ++ * src/common/wchar.c ++ * ++ *------------------------------------------------------------------------- ++ */ ++#include "c.h" ++ ++#include "mb/pg_wchar.h" ++ ++ ++/* ++ * In today's multibyte encodings other than UTF8, this two-byte sequence ++ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0. ++ * ++ * For historical reasons, several verifychar implementations opt to reject ++ * this pair specifically. Byte pair range constraints, in encoding ++ * originator documentation, always excluded this pair. No core conversion ++ * could translate it. However, longstanding verifychar implementations ++ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate ++ * pairs not valid per encoding originator documentation. To avoid tightening ++ * core or non-core conversions in a security patch, we sought this one pair. 
++ * ++ * PQescapeString() historically used spaces for BYTE1; many other values ++ * could suffice for BYTE1. ++ */ ++#define NONUTF8_INVALID_BYTE0 (0x8d) ++#define NONUTF8_INVALID_BYTE1 (' ') ++ ++ ++/* ++ * Operations on multi-byte encodings are driven by a table of helper ++ * functions. ++ * ++ * To add an encoding support, define mblen(), dsplen() and verifier() for ++ * the encoding. For server-encodings, also define mb2wchar() and wchar2mb() ++ * conversion functions. ++ * ++ * These functions generally assume that their input is validly formed. ++ * The "verifier" functions, further down in the file, have to be more ++ * paranoid. ++ * ++ * We expect that mblen() does not need to examine more than the first byte ++ * of the character to discover the correct length. GB18030 is an exception ++ * to that rule, though, as it also looks at second byte. But even that ++ * behaves in a predictable way, if you only pass the first byte: it will ++ * treat 4-byte encoded characters as two 2-byte encoded characters, which is ++ * good enough for all current uses. ++ * ++ * Note: for the display output of psql to work properly, the return values ++ * of the dsplen functions must conform to the Unicode standard. In particular ++ * the NUL character is zero width and control characters are generally ++ * width -1. It is recommended that non-ASCII encodings refer their ASCII ++ * subset to the ASCII routines to ensure consistency. 
++ */ ++ ++/* ++ * SQL/ASCII ++ */ ++static int ++pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ *to++ = *from++; ++ len--; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++static int ++pg_ascii_mblen(const unsigned char *s) ++{ ++ return 1; ++} ++ ++static int ++pg_ascii_dsplen(const unsigned char *s) ++{ ++ if (*s == '\0') ++ return 0; ++ if (*s < 0x20 || *s == 0x7f) ++ return -1; ++ ++ return 1; ++} ++ ++/* ++ * EUC ++ */ ++static int ++pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte ++ * KANA") */ ++ { ++ from++; ++ *to = (SS2 << 8) | *from++; ++ len -= 2; ++ } ++ else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */ ++ { ++ from++; ++ *to = (SS3 << 16) | (*from++ << 8); ++ *to |= *from++; ++ len -= 3; ++ } ++ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */ ++ { ++ *to = *from++ << 8; ++ *to |= *from++; ++ len -= 2; ++ } ++ else /* must be ASCII */ ++ { ++ *to = *from++; ++ len--; ++ } ++ to++; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++static inline int ++pg_euc_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (*s == SS2) ++ len = 2; ++ else if (*s == SS3) ++ len = 3; ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = 1; ++ return len; ++} ++ ++static inline int ++pg_euc_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (*s == SS2) ++ len = 2; ++ else if (*s == SS3) ++ len = 2; ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = pg_ascii_dsplen(s); ++ return len; ++} ++ ++/* ++ * EUC_JP ++ */ ++static int ++pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ return pg_euc2wchar_with_len(from, to, len); ++} ++ ++static int ++pg_eucjp_mblen(const unsigned char *s) ++{ ++ return pg_euc_mblen(s); ++} ++ ++static int ++pg_eucjp_dsplen(const 
unsigned char *s) ++{ ++ int len; ++ ++ if (*s == SS2) ++ len = 1; ++ else if (*s == SS3) ++ len = 2; ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = pg_ascii_dsplen(s); ++ return len; ++} ++ ++/* ++ * EUC_KR ++ */ ++static int ++pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ return pg_euc2wchar_with_len(from, to, len); ++} ++ ++static int ++pg_euckr_mblen(const unsigned char *s) ++{ ++ return pg_euc_mblen(s); ++} ++ ++static int ++pg_euckr_dsplen(const unsigned char *s) ++{ ++ return pg_euc_dsplen(s); ++} ++ ++/* ++ * EUC_CN ++ * ++ */ ++static int ++pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ if (*from == SS2 && len >= 3) /* code set 2 (unused?) */ ++ { ++ from++; ++ *to = (SS2 << 16) | (*from++ << 8); ++ *to |= *from++; ++ len -= 3; ++ } ++ else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */ ++ { ++ from++; ++ *to = (SS3 << 16) | (*from++ << 8); ++ *to |= *from++; ++ len -= 3; ++ } ++ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */ ++ { ++ *to = *from++ << 8; ++ *to |= *from++; ++ len -= 2; ++ } ++ else ++ { ++ *to = *from++; ++ len--; ++ } ++ to++; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++static int ++pg_euccn_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = 1; ++ return len; ++} ++ ++static int ++pg_euccn_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = pg_ascii_dsplen(s); ++ return len; ++} ++ ++/* ++ * EUC_TW ++ * ++ */ ++static int ++pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ if (*from == SS2 && len >= 4) /* code set 2 */ ++ { ++ from++; ++ *to = (((uint32) SS2) << 24) | (*from++ << 16); ++ *to |= *from++ << 8; ++ *to |= *from++; ++ len -= 4; ++ } ++ else if (*from == SS3 && len >= 3) /* 
code set 3 (unused?) */ ++ { ++ from++; ++ *to = (SS3 << 16) | (*from++ << 8); ++ *to |= *from++; ++ len -= 3; ++ } ++ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */ ++ { ++ *to = *from++ << 8; ++ *to |= *from++; ++ len -= 2; ++ } ++ else ++ { ++ *to = *from++; ++ len--; ++ } ++ to++; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++static int ++pg_euctw_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (*s == SS2) ++ len = 4; ++ else if (*s == SS3) ++ len = 3; ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = 1; ++ return len; ++} ++ ++static int ++pg_euctw_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (*s == SS2) ++ len = 2; ++ else if (*s == SS3) ++ len = 2; ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = pg_ascii_dsplen(s); ++ return len; ++} ++ ++/* ++ * Convert pg_wchar to EUC_* encoding. ++ * caller must allocate enough space for "to", including a trailing zero! ++ * len: length of from. ++ * "from" not necessarily null terminated. ++ */ ++static int ++pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ unsigned char c; ++ ++ if ((c = (*from >> 24))) ++ { ++ *to++ = c; ++ *to++ = (*from >> 16) & 0xff; ++ *to++ = (*from >> 8) & 0xff; ++ *to++ = *from & 0xff; ++ cnt += 4; ++ } ++ else if ((c = (*from >> 16))) ++ { ++ *to++ = c; ++ *to++ = (*from >> 8) & 0xff; ++ *to++ = *from & 0xff; ++ cnt += 3; ++ } ++ else if ((c = (*from >> 8))) ++ { ++ *to++ = c; ++ *to++ = *from & 0xff; ++ cnt += 2; ++ } ++ else ++ { ++ *to++ = *from; ++ cnt++; ++ } ++ from++; ++ len--; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++ ++/* ++ * JOHAB ++ */ ++static int ++pg_johab_mblen(const unsigned char *s) ++{ ++ return pg_euc_mblen(s); ++} ++ ++static int ++pg_johab_dsplen(const unsigned char *s) ++{ ++ return pg_euc_dsplen(s); ++} ++ ++/* ++ * convert UTF8 string to pg_wchar (UCS-4) ++ * caller must allocate enough space for "to", including a trailing zero! 
++ * len: length of from. ++ * "from" not necessarily null terminated. ++ */ ++static int ++pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ uint32 c1, ++ c2, ++ c3, ++ c4; ++ ++ while (len > 0 && *from) ++ { ++ if ((*from & 0x80) == 0) ++ { ++ *to = *from++; ++ len--; ++ } ++ else if ((*from & 0xe0) == 0xc0) ++ { ++ if (len < 2) ++ break; /* drop trailing incomplete char */ ++ c1 = *from++ & 0x1f; ++ c2 = *from++ & 0x3f; ++ *to = (c1 << 6) | c2; ++ len -= 2; ++ } ++ else if ((*from & 0xf0) == 0xe0) ++ { ++ if (len < 3) ++ break; /* drop trailing incomplete char */ ++ c1 = *from++ & 0x0f; ++ c2 = *from++ & 0x3f; ++ c3 = *from++ & 0x3f; ++ *to = (c1 << 12) | (c2 << 6) | c3; ++ len -= 3; ++ } ++ else if ((*from & 0xf8) == 0xf0) ++ { ++ if (len < 4) ++ break; /* drop trailing incomplete char */ ++ c1 = *from++ & 0x07; ++ c2 = *from++ & 0x3f; ++ c3 = *from++ & 0x3f; ++ c4 = *from++ & 0x3f; ++ *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4; ++ len -= 4; ++ } ++ else ++ { ++ /* treat a bogus char as length 1; not ours to raise error */ ++ *to = *from++; ++ len--; ++ } ++ to++; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++ ++/* ++ * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of ++ * space allocated. ++ */ ++unsigned char * ++unicode_to_utf8(pg_wchar c, unsigned char *utf8string) ++{ ++ if (c <= 0x7F) ++ { ++ utf8string[0] = c; ++ } ++ else if (c <= 0x7FF) ++ { ++ utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); ++ utf8string[1] = 0x80 | (c & 0x3F); ++ } ++ else if (c <= 0xFFFF) ++ { ++ utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); ++ utf8string[1] = 0x80 | ((c >> 6) & 0x3F); ++ utf8string[2] = 0x80 | (c & 0x3F); ++ } ++ else ++ { ++ utf8string[0] = 0xF0 | ((c >> 18) & 0x07); ++ utf8string[1] = 0x80 | ((c >> 12) & 0x3F); ++ utf8string[2] = 0x80 | ((c >> 6) & 0x3F); ++ utf8string[3] = 0x80 | (c & 0x3F); ++ } ++ ++ return utf8string; ++} ++ ++/* ++ * Trivial conversion from pg_wchar to UTF-8. 
++ * caller should allocate enough space for "to" ++ * len: length of from. ++ * "from" not necessarily null terminated. ++ */ ++static int ++pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ int char_len; ++ ++ unicode_to_utf8(*from, to); ++ char_len = pg_utf_mblen(to); ++ cnt += char_len; ++ to += char_len; ++ from++; ++ len--; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++/* ++ * Return the byte length of a UTF8 character pointed to by s ++ * ++ * Note: in the current implementation we do not support UTF8 sequences ++ * of more than 4 bytes; hence do NOT return a value larger than 4. ++ * We return "1" for any leading byte that is either flat-out illegal or ++ * indicates a length larger than we support. ++ * ++ * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps ++ * other places would need to be fixed to change this. ++ */ ++int ++pg_utf_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if ((*s & 0x80) == 0) ++ len = 1; ++ else if ((*s & 0xe0) == 0xc0) ++ len = 2; ++ else if ((*s & 0xf0) == 0xe0) ++ len = 3; ++ else if ((*s & 0xf8) == 0xf0) ++ len = 4; ++#ifdef NOT_USED ++ else if ((*s & 0xfc) == 0xf8) ++ len = 5; ++ else if ((*s & 0xfe) == 0xfc) ++ len = 6; ++#endif ++ else ++ len = 1; ++ return len; ++} ++ ++/* ++ * This is an implementation of wcwidth() and wcswidth() as defined in ++ * "The Single UNIX Specification, Version 2, The Open Group, 1997" ++ * ++ * ++ * Markus Kuhn -- 2001-09-08 -- public domain ++ * ++ * customised for PostgreSQL ++ * ++ * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c ++ */ ++ ++struct mbinterval ++{ ++ unsigned short first; ++ unsigned short last; ++}; ++ ++/* auxiliary function for binary search in interval table */ ++static int ++mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max) ++{ ++ int min = 0; ++ int mid; ++ ++ if (ucs < table[0].first || ucs > table[max].last) ++ return 0; ++ 
while (max >= min) ++ { ++ mid = (min + max) / 2; ++ if (ucs > table[mid].last) ++ min = mid + 1; ++ else if (ucs < table[mid].first) ++ max = mid - 1; ++ else ++ return 1; ++ } ++ ++ return 0; ++} ++ ++ ++/* The following functions define the column width of an ISO 10646 ++ * character as follows: ++ * ++ * - The null character (U+0000) has a column width of 0. ++ * ++ * - Other C0/C1 control characters and DEL will lead to a return ++ * value of -1. ++ * ++ * - Non-spacing and enclosing combining characters (general ++ * category code Mn or Me in the Unicode database) have a ++ * column width of 0. ++ * ++ * - Other format characters (general category code Cf in the Unicode ++ * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. ++ * ++ * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) ++ * have a column width of 0. ++ * ++ * - Spacing characters in the East Asian Wide (W) or East Asian ++ * FullWidth (F) category as defined in Unicode Technical ++ * Report #11 have a column width of 2. ++ * ++ * - All remaining characters (including all printable ++ * ISO 8859-1 and WGL4 characters, Unicode control characters, ++ * etc.) have a column width of 1. ++ * ++ * This implementation assumes that wchar_t characters are encoded ++ * in ISO 10646. ++ */ ++ ++static int ++ucs_wcwidth(pg_wchar ucs) ++{ ++#include "common/unicode_combining_table.h" ++ ++ /* test for 8-bit control characters */ ++ if (ucs == 0) ++ return 0; ++ ++ if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff) ++ return -1; ++ ++ /* binary search in table of non-spacing characters */ ++ if (mbbisearch(ucs, combining, ++ sizeof(combining) / sizeof(struct mbinterval) - 1)) ++ return 0; ++ ++ /* ++ * if we arrive here, ucs is not a combining or C0/C1 control character ++ */ ++ ++ return 1 + ++ (ucs >= 0x1100 && ++ (ucs <= 0x115f || /* Hangul Jamo init. 
consonants */ ++ (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a && ++ ucs != 0x303f) || /* CJK ... Yi */ ++ (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ ++ (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility ++ * Ideographs */ ++ (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ ++ (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */ ++ (ucs >= 0xffe0 && ucs <= 0xffe6) || ++ (ucs >= 0x20000 && ucs <= 0x2ffff))); ++} ++ ++/* ++ * Convert a UTF-8 character to a Unicode code point. ++ * This is a one-character version of pg_utf2wchar_with_len. ++ * ++ * No error checks here, c must point to a long-enough string. ++ */ ++pg_wchar ++utf8_to_unicode(const unsigned char *c) ++{ ++ if ((*c & 0x80) == 0) ++ return (pg_wchar) c[0]; ++ else if ((*c & 0xe0) == 0xc0) ++ return (pg_wchar) (((c[0] & 0x1f) << 6) | ++ (c[1] & 0x3f)); ++ else if ((*c & 0xf0) == 0xe0) ++ return (pg_wchar) (((c[0] & 0x0f) << 12) | ++ ((c[1] & 0x3f) << 6) | ++ (c[2] & 0x3f)); ++ else if ((*c & 0xf8) == 0xf0) ++ return (pg_wchar) (((c[0] & 0x07) << 18) | ++ ((c[1] & 0x3f) << 12) | ++ ((c[2] & 0x3f) << 6) | ++ (c[3] & 0x3f)); ++ else ++ /* that is an invalid code on purpose */ ++ return 0xffffffff; ++} ++ ++static int ++pg_utf_dsplen(const unsigned char *s) ++{ ++ return ucs_wcwidth(utf8_to_unicode(s)); ++} ++ ++/* ++ * convert mule internal code to pg_wchar ++ * caller should allocate enough space for "to" ++ * len: length of from. ++ * "from" not necessarily null terminated. 
++ */ ++static int ++pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ if (IS_LC1(*from) && len >= 2) ++ { ++ *to = *from++ << 16; ++ *to |= *from++; ++ len -= 2; ++ } ++ else if (IS_LCPRV1(*from) && len >= 3) ++ { ++ from++; ++ *to = *from++ << 16; ++ *to |= *from++; ++ len -= 3; ++ } ++ else if (IS_LC2(*from) && len >= 3) ++ { ++ *to = *from++ << 16; ++ *to |= *from++ << 8; ++ *to |= *from++; ++ len -= 3; ++ } ++ else if (IS_LCPRV2(*from) && len >= 4) ++ { ++ from++; ++ *to = *from++ << 16; ++ *to |= *from++ << 8; ++ *to |= *from++; ++ len -= 4; ++ } ++ else ++ { /* assume ASCII */ ++ *to = (unsigned char) *from++; ++ len--; ++ } ++ to++; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++/* ++ * convert pg_wchar to mule internal code ++ * caller should allocate enough space for "to" ++ * len: length of from. ++ * "from" not necessarily null terminated. ++ */ ++static int ++pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ unsigned char lb; ++ ++ lb = (*from >> 16) & 0xff; ++ if (IS_LC1(lb)) ++ { ++ *to++ = lb; ++ *to++ = *from & 0xff; ++ cnt += 2; ++ } ++ else if (IS_LC2(lb)) ++ { ++ *to++ = lb; ++ *to++ = (*from >> 8) & 0xff; ++ *to++ = *from & 0xff; ++ cnt += 3; ++ } ++ else if (IS_LCPRV1_A_RANGE(lb)) ++ { ++ *to++ = LCPRV1_A; ++ *to++ = lb; ++ *to++ = *from & 0xff; ++ cnt += 3; ++ } ++ else if (IS_LCPRV1_B_RANGE(lb)) ++ { ++ *to++ = LCPRV1_B; ++ *to++ = lb; ++ *to++ = *from & 0xff; ++ cnt += 3; ++ } ++ else if (IS_LCPRV2_A_RANGE(lb)) ++ { ++ *to++ = LCPRV2_A; ++ *to++ = lb; ++ *to++ = (*from >> 8) & 0xff; ++ *to++ = *from & 0xff; ++ cnt += 4; ++ } ++ else if (IS_LCPRV2_B_RANGE(lb)) ++ { ++ *to++ = LCPRV2_B; ++ *to++ = lb; ++ *to++ = (*from >> 8) & 0xff; ++ *to++ = *from & 0xff; ++ cnt += 4; ++ } ++ else ++ { ++ *to++ = *from & 0xff; ++ cnt += 1; ++ } ++ from++; ++ len--; ++ } ++ *to = 0; ++ return 
cnt; ++} ++ ++/* exported for direct use by conv.c */ ++int ++pg_mule_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_LC1(*s)) ++ len = 2; ++ else if (IS_LCPRV1(*s)) ++ len = 3; ++ else if (IS_LC2(*s)) ++ len = 3; ++ else if (IS_LCPRV2(*s)) ++ len = 4; ++ else ++ len = 1; /* assume ASCII */ ++ return len; ++} ++ ++static int ++pg_mule_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ /* ++ * Note: it's not really appropriate to assume that all multibyte charsets ++ * are double-wide on screen. But this seems an okay approximation for ++ * the MULE charsets we currently support. ++ */ ++ ++ if (IS_LC1(*s)) ++ len = 1; ++ else if (IS_LCPRV1(*s)) ++ len = 1; ++ else if (IS_LC2(*s)) ++ len = 2; ++ else if (IS_LCPRV2(*s)) ++ len = 2; ++ else ++ len = 1; /* assume ASCII */ ++ ++ return len; ++} ++ ++/* ++ * ISO8859-1 ++ */ ++static int ++pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ *to++ = *from++; ++ len--; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++/* ++ * Trivial conversion from pg_wchar to single byte encoding. Just ignores ++ * high bits. ++ * caller should allocate enough space for "to" ++ * len: length of from. ++ * "from" not necessarily null terminated. ++ */ ++static int ++pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len) ++{ ++ int cnt = 0; ++ ++ while (len > 0 && *from) ++ { ++ *to++ = *from++; ++ len--; ++ cnt++; ++ } ++ *to = 0; ++ return cnt; ++} ++ ++static int ++pg_latin1_mblen(const unsigned char *s) ++{ ++ return 1; ++} ++ ++static int ++pg_latin1_dsplen(const unsigned char *s) ++{ ++ return pg_ascii_dsplen(s); ++} ++ ++/* ++ * SJIS ++ */ ++static int ++pg_sjis_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (*s >= 0xa1 && *s <= 0xdf) ++ len = 1; /* 1 byte kana? */ ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* kanji? 
*/ ++ else ++ len = 1; /* should be ASCII */ ++ return len; ++} ++ ++static int ++pg_sjis_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (*s >= 0xa1 && *s <= 0xdf) ++ len = 1; /* 1 byte kana? */ ++ else if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* kanji? */ ++ else ++ len = pg_ascii_dsplen(s); /* should be ASCII */ ++ return len; ++} ++ ++/* ++ * Big5 ++ */ ++static int ++pg_big5_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* kanji? */ ++ else ++ len = 1; /* should be ASCII */ ++ return len; ++} ++ ++static int ++pg_big5_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* kanji? */ ++ else ++ len = pg_ascii_dsplen(s); /* should be ASCII */ ++ return len; ++} ++ ++/* ++ * GBK ++ */ ++static int ++pg_gbk_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* kanji? */ ++ else ++ len = 1; /* should be ASCII */ ++ return len; ++} ++ ++static int ++pg_gbk_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* kanji? */ ++ else ++ len = pg_ascii_dsplen(s); /* should be ASCII */ ++ return len; ++} ++ ++/* ++ * UHC ++ */ ++static int ++pg_uhc_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* 2byte? */ ++ else ++ len = 1; /* should be ASCII */ ++ return len; ++} ++ ++static int ++pg_uhc_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; /* 2byte? */ ++ else ++ len = pg_ascii_dsplen(s); /* should be ASCII */ ++ return len; ++} ++ ++/* ++ * GB18030 ++ * Added by Bill Huang , ++ */ ++ ++/* ++ * Unlike all other mblen() functions, this also looks at the second byte of ++ * the input. However, if you only pass the first byte of a multi-byte ++ * string, and \0 as the second byte, this still works in a predictable way: ++ * a 4-byte character will be reported as two 2-byte characters. 
That's ++ * enough for all current uses, as a client-only encoding. It works that ++ * way, because in any valid 4-byte GB18030-encoded character, the third and ++ * fourth byte look like a 2-byte encoded character, when looked at ++ * separately. ++ */ ++static int ++pg_gb18030_mblen(const unsigned char *s) ++{ ++ int len; ++ ++ if (!IS_HIGHBIT_SET(*s)) ++ len = 1; /* ASCII */ ++ else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39) ++ len = 4; ++ else ++ len = 2; ++ return len; ++} ++ ++static int ++pg_gb18030_dsplen(const unsigned char *s) ++{ ++ int len; ++ ++ if (IS_HIGHBIT_SET(*s)) ++ len = 2; ++ else ++ len = pg_ascii_dsplen(s); /* ASCII */ ++ return len; ++} ++ ++/* ++ *------------------------------------------------------------------- ++ * multibyte sequence validators ++ * ++ * These functions accept "s", a pointer to the first byte of a string, ++ * and "len", the remaining length of the string. If there is a validly ++ * encoded character beginning at *s, return its length in bytes; else ++ * return -1. ++ * ++ * The functions can assume that len > 0 and that *s != '\0', but they must ++ * test for and reject zeroes in any additional bytes of a multibyte character. ++ * ++ * Note that this definition allows the function for a single-byte ++ * encoding to be just "return 1". 
++ *------------------------------------------------------------------- ++ */ ++ ++static int ++pg_ascii_verifier(const unsigned char *s, int len) ++{ ++ return 1; ++} ++ ++#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) ++ ++static int ++pg_eucjp_verifier(const unsigned char *s, int len) ++{ ++ int l; ++ unsigned char c1, ++ c2; ++ ++ c1 = *s++; ++ ++ switch (c1) ++ { ++ case SS2: /* JIS X 0201 */ ++ l = 2; ++ if (l > len) ++ return -1; ++ c2 = *s++; ++ if (c2 < 0xa1 || c2 > 0xdf) ++ return -1; ++ break; ++ ++ case SS3: /* JIS X 0212 */ ++ l = 3; ++ if (l > len) ++ return -1; ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ break; ++ ++ default: ++ if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ ++ { ++ l = 2; ++ if (l > len) ++ return -1; ++ if (!IS_EUC_RANGE_VALID(c1)) ++ return -1; ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ } ++ else ++ /* must be ASCII */ ++ { ++ l = 1; ++ } ++ break; ++ } ++ ++ return l; ++} ++ ++static int ++pg_euckr_verifier(const unsigned char *s, int len) ++{ ++ int l; ++ unsigned char c1, ++ c2; ++ ++ c1 = *s++; ++ ++ if (IS_HIGHBIT_SET(c1)) ++ { ++ l = 2; ++ if (l > len) ++ return -1; ++ if (!IS_EUC_RANGE_VALID(c1)) ++ return -1; ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ } ++ else ++ /* must be ASCII */ ++ { ++ l = 1; ++ } ++ ++ return l; ++} ++ ++/* EUC-CN byte sequences are exactly same as EUC-KR */ ++#define pg_euccn_verifier pg_euckr_verifier ++ ++static int ++pg_euctw_verifier(const unsigned char *s, int len) ++{ ++ int l; ++ unsigned char c1, ++ c2; ++ ++ c1 = *s++; ++ ++ switch (c1) ++ { ++ case SS2: /* CNS 11643 Plane 1-7 */ ++ l = 4; ++ if (l > len) ++ return -1; ++ c2 = *s++; ++ if (c2 < 0xa1 || c2 > 0xa7) ++ return -1; ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ break; ++ ++ case SS3: /* unused */ ++ return -1; ++ ++ default: ++ 
if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */ ++ { ++ l = 2; ++ if (l > len) ++ return -1; ++ /* no further range check on c1? */ ++ c2 = *s++; ++ if (!IS_EUC_RANGE_VALID(c2)) ++ return -1; ++ } ++ else ++ /* must be ASCII */ ++ { ++ l = 1; ++ } ++ break; ++ } ++ return l; ++} ++ ++static int ++pg_johab_verifier(const unsigned char *s, int len) ++{ ++ int l, ++ mbl; ++ unsigned char c; ++ ++ l = mbl = pg_johab_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ if (!IS_HIGHBIT_SET(*s)) ++ return mbl; ++ ++ while (--l > 0) ++ { ++ c = *++s; ++ if (!IS_EUC_RANGE_VALID(c)) ++ return -1; ++ } ++ return mbl; ++} ++ ++static int ++pg_mule_verifier(const unsigned char *s, int len) ++{ ++ int l, ++ mbl; ++ unsigned char c; ++ ++ l = mbl = pg_mule_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ while (--l > 0) ++ { ++ c = *++s; ++ if (!IS_HIGHBIT_SET(c)) ++ return -1; ++ } ++ return mbl; ++} ++ ++static int ++pg_latin1_verifier(const unsigned char *s, int len) ++{ ++ return 1; ++} ++ ++static int ++pg_sjis_verifier(const unsigned char *s, int len) ++{ ++ int l, ++ mbl; ++ unsigned char c1, ++ c2; ++ ++ l = mbl = pg_sjis_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ if (l == 1) /* pg_sjis_mblen already verified it */ ++ return mbl; ++ ++ c1 = *s++; ++ c2 = *s; ++ if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) ++ return -1; ++ return mbl; ++} ++ ++static int ++pg_big5_verifier(const unsigned char *s, int len) ++{ ++ int l, ++ mbl; ++ ++ l = mbl = pg_big5_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ if (l == 2 && ++ s[0] == NONUTF8_INVALID_BYTE0 && ++ s[1] == NONUTF8_INVALID_BYTE1) ++ return -1; ++ ++ while (--l > 0) ++ { ++ if (*++s == '\0') ++ return -1; ++ } ++ ++ return mbl; ++} ++ ++static int ++pg_gbk_verifier(const unsigned char *s, int len) ++{ ++ int l, ++ mbl; ++ ++ l = mbl = pg_gbk_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ if (l == 2 && ++ s[0] == NONUTF8_INVALID_BYTE0 && ++ s[1] == NONUTF8_INVALID_BYTE1) ++ return -1; ++ ++ while (--l > 0) ++ { ++ if (*++s 
== '\0') ++ return -1; ++ } ++ ++ return mbl; ++} ++ ++static int ++pg_uhc_verifier(const unsigned char *s, int len) ++{ ++ int l, ++ mbl; ++ ++ l = mbl = pg_uhc_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ if (l == 2 && ++ s[0] == NONUTF8_INVALID_BYTE0 && ++ s[1] == NONUTF8_INVALID_BYTE1) ++ return -1; ++ ++ while (--l > 0) ++ { ++ if (*++s == '\0') ++ return -1; ++ } ++ ++ return mbl; ++} ++ ++static int ++pg_gb18030_verifier(const unsigned char *s, int len) ++{ ++ int l; ++ ++ if (!IS_HIGHBIT_SET(*s)) ++ l = 1; /* ASCII */ ++ else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39) ++ { ++ /* Should be 4-byte, validate remaining bytes */ ++ if (*s >= 0x81 && *s <= 0xfe && ++ *(s + 2) >= 0x81 && *(s + 2) <= 0xfe && ++ *(s + 3) >= 0x30 && *(s + 3) <= 0x39) ++ l = 4; ++ else ++ l = -1; ++ } ++ else if (len >= 2 && *s >= 0x81 && *s <= 0xfe) ++ { ++ /* Should be 2-byte, validate */ ++ if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) || ++ (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe)) ++ l = 2; ++ else ++ l = -1; ++ } ++ else ++ l = -1; ++ return l; ++} ++ ++static int ++pg_utf8_verifier(const unsigned char *s, int len) ++{ ++ int l = pg_utf_mblen(s); ++ ++ if (len < l) ++ return -1; ++ ++ if (!pg_utf8_islegal(s, l)) ++ return -1; ++ ++ return l; ++} ++ ++/* ++ * Check for validity of a single UTF-8 encoded character ++ * ++ * This directly implements the rules in RFC3629. The bizarre-looking ++ * restrictions on the second byte are meant to ensure that there isn't ++ * more than one encoding of a given Unicode character point; that is, ++ * you may not use a longer-than-necessary byte sequence with high order ++ * zero bits to represent a character that would fit in fewer bytes. ++ * To do otherwise is to create security hazards (eg, create an apparent ++ * non-ASCII character that decodes to plain ASCII). ++ * ++ * length is assumed to have been obtained by pg_utf_mblen(), and the ++ * caller must have checked that that many bytes are present in the buffer. 
++ */ ++bool ++pg_utf8_islegal(const unsigned char *source, int length) ++{ ++ unsigned char a; ++ ++ switch (length) ++ { ++ default: ++ /* reject lengths 5 and 6 for now */ ++ return false; ++ case 4: ++ a = source[3]; ++ if (a < 0x80 || a > 0xBF) ++ return false; ++ /* FALL THRU */ ++ case 3: ++ a = source[2]; ++ if (a < 0x80 || a > 0xBF) ++ return false; ++ /* FALL THRU */ ++ case 2: ++ a = source[1]; ++ switch (*source) ++ { ++ case 0xE0: ++ if (a < 0xA0 || a > 0xBF) ++ return false; ++ break; ++ case 0xED: ++ if (a < 0x80 || a > 0x9F) ++ return false; ++ break; ++ case 0xF0: ++ if (a < 0x90 || a > 0xBF) ++ return false; ++ break; ++ case 0xF4: ++ if (a < 0x80 || a > 0x8F) ++ return false; ++ break; ++ default: ++ if (a < 0x80 || a > 0xBF) ++ return false; ++ break; ++ } ++ /* FALL THRU */ ++ case 1: ++ a = *source; ++ if (a >= 0x80 && a < 0xC2) ++ return false; ++ if (a > 0xF4) ++ return false; ++ break; ++ } ++ return true; ++} ++ ++ ++/* ++ * Fills the provided buffer with two bytes such that: ++ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0 ++ */ ++void ++pg_encoding_set_invalid(int encoding, char *dst) ++{ ++ Assert(pg_encoding_max_length(encoding) > 1); ++ ++ dst[0] = (encoding == PG_UTF8 ? 
0xc0 : NONUTF8_INVALID_BYTE0); ++ dst[1] = NONUTF8_INVALID_BYTE1; ++} ++ ++/* ++ *------------------------------------------------------------------- ++ * encoding info table ++ * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h) ++ *------------------------------------------------------------------- ++ */ ++const pg_wchar_tbl pg_wchar_table[] = { ++ {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */ ++ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */ ++ {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */ ++ {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */ ++ {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */ ++ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */ ++ {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */ ++ {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, 
pg_latin1_verifier, 1}, /* PG_LATIN5 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 
ISO-8859-7 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */ ++ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ ++ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ ++ {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ ++ {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ ++ {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ ++ {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ ++ {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ ++ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ ++}; ++ ++/* ++ * Returns the byte length of a multibyte character. ++ * ++ * Caution: when dealing with text that is not certainly valid in the ++ * specified encoding, the result may exceed the actual remaining ++ * string length. Callers that are not prepared to deal with that ++ * should use pg_encoding_mblen_bounded() instead. ++ */ ++int ++pg_encoding_mblen(int encoding, const char *mbstr) ++{ ++ return (PG_VALID_ENCODING(encoding) ? 
++ pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) : ++ pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr)); ++} ++ ++/* ++ * Returns the byte length of a multibyte character; but not more than ++ * the distance to end of string. ++ */ ++int ++pg_encoding_mblen_bounded(int encoding, const char *mbstr) ++{ ++ return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr)); ++} ++ ++/* ++ * Returns the display length of a multibyte character. ++ */ ++int ++pg_encoding_dsplen(int encoding, const char *mbstr) ++{ ++ return (PG_VALID_ENCODING(encoding) ? ++ pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) : ++ pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr)); ++} ++ ++/* ++ * Verify the first multibyte character of the given string. ++ * Return its byte length if good, -1 if bad. (See comments above for ++ * full details of the mbverify API.) ++ */ ++int ++pg_encoding_verifymb(int encoding, const char *mbstr, int len) ++{ ++ return (PG_VALID_ENCODING(encoding) ? ++ pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) : ++ pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len)); ++} ++ ++/* v14+ function name, for easier backpatching */ ++int ++pg_encoding_verifymbchar(int encoding, const char *mbstr, int len) ++{ ++ int ok_bytes = pg_encoding_verifymb(encoding, mbstr, len); ++ ++ if (ok_bytes == 0) ++ return -1; ++ return ok_bytes; ++} ++ ++/* replace v14+ function, adapted from pg_verify_mbstr_len */ ++int ++pg_encoding_verifymbstr(int encoding, const char *mbstr, int len) ++{ ++ mbverifier mbverify; ++ int ok_bytes; ++ ++ Assert(PG_VALID_ENCODING(encoding)); ++ ++ /* ++ * In single-byte encodings, we need only reject nulls (\0). 
++ */ ++ if (pg_encoding_max_length(encoding) <= 1) ++ { ++ const char *nullpos = memchr(mbstr, 0, len); ++ ++ if (nullpos == NULL) ++ return len; ++ return nullpos - mbstr; ++ } ++ ++ /* fetch function pointer just once */ ++ mbverify = pg_wchar_table[encoding].mbverify; ++ ++ ok_bytes = 0; ++ ++ while (len > 0) ++ { ++ int l; ++ ++ /* fast path for ASCII-subset characters */ ++ if (!IS_HIGHBIT_SET(*mbstr)) ++ { ++ if (*mbstr != '\0') ++ { ++ ok_bytes++; ++ mbstr++; ++ len--; ++ continue; ++ } ++ return ok_bytes; ++ } ++ ++ l = (*mbverify) ((const unsigned char *) mbstr, len); ++ ++ if (l < 0) ++ return ok_bytes; ++ ++ mbstr += l; ++ len -= l; ++ ok_bytes += l; ++ } ++ return ok_bytes; ++} ++ ++/* ++ * fetch maximum length of a given encoding ++ */ ++int ++pg_encoding_max_length(int encoding) ++{ ++ Assert(PG_VALID_ENCODING(encoding)); ++ ++ /* ++ * Check for the encoding despite the assert, due to some mingw versions ++ * otherwise issuing bogus warnings. ++ */ ++ return PG_VALID_ENCODING(encoding) ? 
++ pg_wchar_table[encoding].maxmblen : ++ pg_wchar_table[PG_SQL_ASCII].maxmblen; ++} +diff --git a/src/include/common/unicode_combining_table.h b/src/include/common/unicode_combining_table.h +new file mode 100644 +index 00000000000..a9f10c31bc8 +--- /dev/null ++++ b/src/include/common/unicode_combining_table.h +@@ -0,0 +1,196 @@ ++/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */ ++ ++static const struct mbinterval combining[] = { ++ {0x0300, 0x036F}, ++ {0x0483, 0x0489}, ++ {0x0591, 0x05BD}, ++ {0x05BF, 0x05BF}, ++ {0x05C1, 0x05C2}, ++ {0x05C4, 0x05C5}, ++ {0x05C7, 0x05C7}, ++ {0x0610, 0x061A}, ++ {0x064B, 0x065F}, ++ {0x0670, 0x0670}, ++ {0x06D6, 0x06DC}, ++ {0x06DF, 0x06E4}, ++ {0x06E7, 0x06E8}, ++ {0x06EA, 0x06ED}, ++ {0x0711, 0x0711}, ++ {0x0730, 0x074A}, ++ {0x07A6, 0x07B0}, ++ {0x07EB, 0x07F3}, ++ {0x07FD, 0x07FD}, ++ {0x0816, 0x0819}, ++ {0x081B, 0x0823}, ++ {0x0825, 0x0827}, ++ {0x0829, 0x082D}, ++ {0x0859, 0x085B}, ++ {0x08D3, 0x08E1}, ++ {0x08E3, 0x0902}, ++ {0x093A, 0x093A}, ++ {0x093C, 0x093C}, ++ {0x0941, 0x0948}, ++ {0x094D, 0x094D}, ++ {0x0951, 0x0957}, ++ {0x0962, 0x0963}, ++ {0x0981, 0x0981}, ++ {0x09BC, 0x09BC}, ++ {0x09C1, 0x09C4}, ++ {0x09CD, 0x09CD}, ++ {0x09E2, 0x09E3}, ++ {0x09FE, 0x0A02}, ++ {0x0A3C, 0x0A3C}, ++ {0x0A41, 0x0A51}, ++ {0x0A70, 0x0A71}, ++ {0x0A75, 0x0A75}, ++ {0x0A81, 0x0A82}, ++ {0x0ABC, 0x0ABC}, ++ {0x0AC1, 0x0AC8}, ++ {0x0ACD, 0x0ACD}, ++ {0x0AE2, 0x0AE3}, ++ {0x0AFA, 0x0B01}, ++ {0x0B3C, 0x0B3C}, ++ {0x0B3F, 0x0B3F}, ++ {0x0B41, 0x0B44}, ++ {0x0B4D, 0x0B56}, ++ {0x0B62, 0x0B63}, ++ {0x0B82, 0x0B82}, ++ {0x0BC0, 0x0BC0}, ++ {0x0BCD, 0x0BCD}, ++ {0x0C00, 0x0C00}, ++ {0x0C04, 0x0C04}, ++ {0x0C3E, 0x0C40}, ++ {0x0C46, 0x0C56}, ++ {0x0C62, 0x0C63}, ++ {0x0C81, 0x0C81}, ++ {0x0CBC, 0x0CBC}, ++ {0x0CBF, 0x0CBF}, ++ {0x0CC6, 0x0CC6}, ++ {0x0CCC, 0x0CCD}, ++ {0x0CE2, 0x0CE3}, ++ {0x0D00, 0x0D01}, ++ {0x0D3B, 0x0D3C}, ++ {0x0D41, 0x0D44}, ++ {0x0D4D, 0x0D4D}, ++ {0x0D62, 0x0D63}, ++ {0x0D81, 
0x0D81}, ++ {0x0DCA, 0x0DCA}, ++ {0x0DD2, 0x0DD6}, ++ {0x0E31, 0x0E31}, ++ {0x0E34, 0x0E3A}, ++ {0x0E47, 0x0E4E}, ++ {0x0EB1, 0x0EB1}, ++ {0x0EB4, 0x0EBC}, ++ {0x0EC8, 0x0ECD}, ++ {0x0F18, 0x0F19}, ++ {0x0F35, 0x0F35}, ++ {0x0F37, 0x0F37}, ++ {0x0F39, 0x0F39}, ++ {0x0F71, 0x0F7E}, ++ {0x0F80, 0x0F84}, ++ {0x0F86, 0x0F87}, ++ {0x0F8D, 0x0FBC}, ++ {0x0FC6, 0x0FC6}, ++ {0x102D, 0x1030}, ++ {0x1032, 0x1037}, ++ {0x1039, 0x103A}, ++ {0x103D, 0x103E}, ++ {0x1058, 0x1059}, ++ {0x105E, 0x1060}, ++ {0x1071, 0x1074}, ++ {0x1082, 0x1082}, ++ {0x1085, 0x1086}, ++ {0x108D, 0x108D}, ++ {0x109D, 0x109D}, ++ {0x135D, 0x135F}, ++ {0x1712, 0x1714}, ++ {0x1732, 0x1734}, ++ {0x1752, 0x1753}, ++ {0x1772, 0x1773}, ++ {0x17B4, 0x17B5}, ++ {0x17B7, 0x17BD}, ++ {0x17C6, 0x17C6}, ++ {0x17C9, 0x17D3}, ++ {0x17DD, 0x17DD}, ++ {0x180B, 0x180D}, ++ {0x1885, 0x1886}, ++ {0x18A9, 0x18A9}, ++ {0x1920, 0x1922}, ++ {0x1927, 0x1928}, ++ {0x1932, 0x1932}, ++ {0x1939, 0x193B}, ++ {0x1A17, 0x1A18}, ++ {0x1A1B, 0x1A1B}, ++ {0x1A56, 0x1A56}, ++ {0x1A58, 0x1A60}, ++ {0x1A62, 0x1A62}, ++ {0x1A65, 0x1A6C}, ++ {0x1A73, 0x1A7F}, ++ {0x1AB0, 0x1B03}, ++ {0x1B34, 0x1B34}, ++ {0x1B36, 0x1B3A}, ++ {0x1B3C, 0x1B3C}, ++ {0x1B42, 0x1B42}, ++ {0x1B6B, 0x1B73}, ++ {0x1B80, 0x1B81}, ++ {0x1BA2, 0x1BA5}, ++ {0x1BA8, 0x1BA9}, ++ {0x1BAB, 0x1BAD}, ++ {0x1BE6, 0x1BE6}, ++ {0x1BE8, 0x1BE9}, ++ {0x1BED, 0x1BED}, ++ {0x1BEF, 0x1BF1}, ++ {0x1C2C, 0x1C33}, ++ {0x1C36, 0x1C37}, ++ {0x1CD0, 0x1CD2}, ++ {0x1CD4, 0x1CE0}, ++ {0x1CE2, 0x1CE8}, ++ {0x1CED, 0x1CED}, ++ {0x1CF4, 0x1CF4}, ++ {0x1CF8, 0x1CF9}, ++ {0x1DC0, 0x1DFF}, ++ {0x20D0, 0x20F0}, ++ {0x2CEF, 0x2CF1}, ++ {0x2D7F, 0x2D7F}, ++ {0x2DE0, 0x2DFF}, ++ {0x302A, 0x302D}, ++ {0x3099, 0x309A}, ++ {0xA66F, 0xA672}, ++ {0xA674, 0xA67D}, ++ {0xA69E, 0xA69F}, ++ {0xA6F0, 0xA6F1}, ++ {0xA802, 0xA802}, ++ {0xA806, 0xA806}, ++ {0xA80B, 0xA80B}, ++ {0xA825, 0xA826}, ++ {0xA82C, 0xA82C}, ++ {0xA8C4, 0xA8C5}, ++ {0xA8E0, 0xA8F1}, ++ {0xA8FF, 0xA8FF}, ++ {0xA926, 0xA92D}, ++ {0xA947, 
0xA951}, ++ {0xA980, 0xA982}, ++ {0xA9B3, 0xA9B3}, ++ {0xA9B6, 0xA9B9}, ++ {0xA9BC, 0xA9BD}, ++ {0xA9E5, 0xA9E5}, ++ {0xAA29, 0xAA2E}, ++ {0xAA31, 0xAA32}, ++ {0xAA35, 0xAA36}, ++ {0xAA43, 0xAA43}, ++ {0xAA4C, 0xAA4C}, ++ {0xAA7C, 0xAA7C}, ++ {0xAAB0, 0xAAB0}, ++ {0xAAB2, 0xAAB4}, ++ {0xAAB7, 0xAAB8}, ++ {0xAABE, 0xAABF}, ++ {0xAAC1, 0xAAC1}, ++ {0xAAEC, 0xAAED}, ++ {0xAAF6, 0xAAF6}, ++ {0xABE5, 0xABE5}, ++ {0xABE8, 0xABE8}, ++ {0xABED, 0xABED}, ++ {0xFB1E, 0xFB1E}, ++ {0xFE00, 0xFE0F}, ++ {0xFE20, 0xFE2F}, ++}; +diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h +index 07b316fae1d..2bbdf2e792f 100644 +--- a/src/include/mb/pg_wchar.h ++++ b/src/include/mb/pg_wchar.h +@@ -521,6 +521,10 @@ extern int pg_valid_server_encoding_id(int encoding); + * of them do exist inside libpq. + */ + extern void pg_encoding_set_invalid(int encoding, char *dst); ++extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len); ++extern int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len); ++extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); ++ + extern int pg_mb2wchar(const char *from, pg_wchar *to); + extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); + extern int pg_encoding_mb2wchar_with_len(int encoding, +diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c +index a29d19a6268..dccee6a5597 100644 +--- a/src/interfaces/libpq/fe-exec.c ++++ b/src/interfaces/libpq/fe-exec.c +@@ -132,6 +132,8 @@ static int check_field_number(const PGresult *res, int field_num); + #define PGRESULT_SEP_ALLOC_THRESHOLD (PGRESULT_DATA_BLOCKSIZE / 2) + + ++ ++ + /* + * PQmakeEmptyPGresult + * returns a newly allocated, initialized PGresult with given status. 
+@@ -3403,9 +3405,10 @@ PQescapeStringInternal(PGconn *conn, + if (error) + *error = 1; + if (conn) +- libpq_append_conn_error(conn, "incomplete multibyte character"); ++ printfPQExpBuffer(&conn->errorMessage, ++ libpq_gettext("incomplete multibyte character\n")); + +- pg_encoding_set_invalid(encoding, target); ++ pg_encoding_set_invalid(encoding, target); + target += 2; + source++; + remaining--; +-- +2.39.5 (Apple Git-154) + + +From 27827fe62777a809cc3f5a54742839bc031b02f6 Mon Sep 17 00:00:00 2001 +From: Filip Janus +Date: Tue, 18 Mar 2025 10:11:09 +0100 +Subject: [PATCH 8/8] Fix failing dropdb.c + +--- + src/bin/scripts/dropdb.c | 4 ---- + 1 files changed, 0 insertion(+), 5 deletions(-) + +diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c +index ed3a2c8c19a..140982717d9 100644 +--- a/src/bin/scripts/dropdb.c ++++ b/src/bin/scripts/dropdb.c +@@ -122,10 +122,6 @@ main(int argc, char *argv[]) + exit(0); + } + +- initPQExpBuffer(&sql); +- +- appendPQExpBuffer(&sql, "DROP DATABASE %s%s;", +- (if_exists ? "IF EXISTS " : ""), fmtIdEnc(dbname, PQclientEncoding(conn))); + + /* Avoid trying to drop postgres db while we are connected to it. 
*/ + if (maintenance_db == NULL && strcmp(dbname, "postgres") == 0) +2.39.5 (Apple Git-154) + diff --git a/SOURCES/timezone-test-fix.patch b/SOURCES/timezone-test-fix.patch new file mode 100644 index 0000000..d97920d --- /dev/null +++ b/SOURCES/timezone-test-fix.patch @@ -0,0 +1,25 @@ +From 9d18b30ac7a17d70ee789b710865bd20b206023d Mon Sep 17 00:00:00 2001 +From: Filip Janus +Date: Tue, 18 Mar 2025 10:11:09 +0100 +Subject: [PATCH] Fix failing test regardless the CVE-2025-1094 fix + +--- + src/test/regress/expected/timestamptz.out | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/test/regress/expected/timestamptz.out b/src/test/regress/expected/timestamptz.out +index 55efd183868..5964b65bc6b 100644 +--- a/src/test/regress/expected/timestamptz.out ++++ b/src/test/regress/expected/timestamptz.out +@@ -2060,7 +2060,7 @@ SELECT make_timestamptz(2014, 12, 10, 0, 0, 0, 'Europe/Prague') AT TIME ZONE 'UT + SELECT make_timestamptz(1846, 12, 10, 0, 0, 0, 'Asia/Manila') AT TIME ZONE 'UTC'; + timezone + -------------------------- +- Wed Dec 09 15:56:00 1846 ++ Wed Dec 09 15:56:08 1846 + (1 row) + + SELECT make_timestamptz(1881, 12, 10, 0, 0, 0, 'Europe/Paris') AT TIME ZONE 'UTC'; +-- +2.39.5 (Apple Git-154) + diff --git a/SPECS/postgresql.spec b/SPECS/postgresql.spec index 928adfc..b138d42 100644 --- a/SPECS/postgresql.spec +++ b/SPECS/postgresql.spec @@ -60,7 +60,7 @@ Summary: PostgreSQL client programs Name: postgresql %global majorversion 12 Version: %{majorversion}.22 -Release: 1%{?dist} +Release: 3%{?dist} # The PostgreSQL license is very similar to other MIT licenses, but the OSI # recognizes it as an independent license, so we do as well. 
@@ -108,6 +108,8 @@ Patch6: postgresql-man.patch Patch8: postgresql-external-libpq.patch Patch9: postgresql-server-pg_config.patch Patch10: postgresql-12.5-contrib-dblink-expected-out.patch +Patch11: backport-cve-2025-1094.patch +Patch12: timezone-test-fix.patch BuildRequires: gcc BuildRequires: perl(ExtUtils::MakeMaker) glibc-devel bison flex gawk @@ -369,6 +371,8 @@ benchmarks. %patch8 -p1 %patch9 -p1 %patch10 -p1 +%patch11 -p1 +%patch12 -p1 # We used to run autoconf here, but there's no longer any real need to, # since Postgres ships with a reasonably modern configure script. @@ -1224,6 +1228,12 @@ make -C postgresql-setup-%{setup_version} check %changelog +* Tue Mar 18 2025 Filip Janus - 12.22-3 +- Fix backport for CVE-2025-1094 + +* Tue Mar 18 2025 Filip Janus - 12.22-2 +- Backport fix for CVE-2025-1094 + * Thu Nov 21 2024 Lukas Javorsky - 12.22-1 - Update to 12.22 - Fixes: CVE-2024-10976 CVE-2024-10978