postgresql/SOURCES/backport-cve-2025-1094.patch
2025-03-26 13:30:30 +00:00

3671 lines
99 KiB
Diff

From 62235454d50a62138341a87be065e4681684753a Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 10 Feb 2025 10:03:37 -0500
Subject: [PATCH 1/8] Backport upstream commit
4dc28963533704fc7dd922b9447467466a233d89 Add pg_encoding_set_invalid()
There are cases where we cannot / do not want to error out for invalidly
encoded input. In such cases it can be useful to replace e.g. an incomplete
multi-byte character with bytes that will trigger an error when getting
validated as part of a larger string.
Unfortunately, until now, for some encodings no such sequence existed. For
those encodings this commit removes one previously accepted input combination
- we consider that to be ok, as the chosen bytes are outside of the valid
ranges for the encodings, we just previously failed to detect that.
As we cannot add a new field to pg_wchar_table without breaking ABI, this is
implemented "in-line" in the newly added function.
---
src/backend/utils/mb/wchar.c | 55 +++++++++++++++++++++++++++++++++++-
src/include/mb/pg_wchar.h | 1 +
2 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 1b5ce1740c0..872241cc804 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -14,6 +14,25 @@
#include "mb/pg_wchar.h"
+/*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically. Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair. No core conversion
+ * could translate it. However, longstanding verifychar implementations
+ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation. To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)
+#define NONUTF8_INVALID_BYTE1 (' ')
+
+
/*
* Operations on multi-byte encodings are driven by a table of helper
* functions.
@@ -1394,6 +1413,11 @@ pg_big5_verifier(const unsigned char *s, int len)
if (len < l)
return -1;
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1;
+
while (--l > 0)
{
if (*++s == '\0')
@@ -1414,6 +1438,11 @@ pg_gbk_verifier(const unsigned char *s, int len)
if (len < l)
return -1;
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1;
+
while (--l > 0)
{
if (*++s == '\0')
@@ -1434,6 +1463,11 @@ pg_uhc_verifier(const unsigned char *s, int len)
if (len < l)
return -1;
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1;
+
while (--l > 0)
{
if (*++s == '\0')
@@ -1768,6 +1802,19 @@ pg_eucjp_increment(unsigned char *charptr, int length)
#endif /* !FRONTEND */
+/*
+ * Fills the provided buffer with two bytes such that:
+ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+ Assert(pg_encoding_max_length(encoding) > 1);
+
+ dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
+ dst[1] = NONUTF8_INVALID_BYTE1;
+}
+
/*
*-------------------------------------------------------------------
* encoding info table
@@ -1869,7 +1916,13 @@ pg_encoding_max_length(int encoding)
{
Assert(PG_VALID_ENCODING(encoding));
- return pg_wchar_table[encoding].maxmblen;
+ /*
+ * Check for the encoding despite the assert, due to some mingw versions
+ * otherwise issuing bogus warnings.
+ */
+ return PG_VALID_ENCODING(encoding) ?
+ pg_wchar_table[encoding].maxmblen :
+ pg_wchar_table[PG_SQL_ASCII].maxmblen;
}
#ifndef FRONTEND
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index ec101a834ef..07b316fae1d 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -520,6 +520,7 @@ extern int pg_valid_server_encoding_id(int encoding);
* Remaining functions are not considered part of libpq's API, though many
* of them do exist inside libpq.
*/
+extern void pg_encoding_set_invalid(int encoding, char *dst);
extern int pg_mb2wchar(const char *from, pg_wchar *to);
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
extern int pg_encoding_mb2wchar_with_len(int encoding,
--
2.39.5 (Apple Git-154)
From 581adbfe8c9db2e641705b308a74e5b6d89c61a6 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 10 Feb 2025 10:03:37 -0500
Subject: [PATCH 2/8] Backport upstream
commit:3e98c8ce50e46d58b91bf3ea806e995296dc5b91 Specify the encoding of input
to fmtId()
This commit adds fmtIdEnc() and fmtQualifiedIdEnc(), which allow specifying
the encoding as an explicit argument. Additionally setFmtEncoding() is
provided, which defines the encoding when no explicit encoding is provided, to
avoid breaking all code using fmtId().
All users of fmtId()/fmtQualifiedId() are either converted to the explicit
version or a call to setFmtEncoding() has been added.
This commit does not yet utilize the now well-defined encoding, that will
happen in a subsequent commit.
---
src/bin/pg_dump/pg_backup_archiver.c | 1 +
src/bin/pg_dump/pg_dump.c | 1 +
src/bin/pg_dump/pg_dumpall.c | 1 +
src/bin/psql/command.c | 3 +
src/bin/scripts/common.c | 5 +-
src/bin/scripts/createdb.c | 2 +
src/bin/scripts/createuser.c | 2 +
src/bin/scripts/dropdb.c | 8 ++-
src/bin/scripts/dropuser.c | 3 +-
src/bin/scripts/reindexdb.c | 4 +-
src/bin/scripts/vacuumdb.c | 5 +-
src/fe_utils/string_utils.c | 84 ++++++++++++++++++++++++++--
src/include/fe_utils/string_utils.h | 5 +-
13 files changed, 109 insertions(+), 15 deletions(-)
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 6476f7119af..489a84aca3d 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -2731,6 +2731,7 @@ processEncodingEntry(ArchiveHandle *AH, TocEntry *te)
fatal("unrecognized encoding \"%s\"",
ptr1);
AH->public.encoding = encoding;
+ setFmtEncoding(encoding);
}
else
fatal("invalid ENCODING item: %s",
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 67a3714c62c..53fc95f3033 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -1085,6 +1085,7 @@ setup_connection(Archive *AH, const char *dumpencoding,
* we know how to escape strings.
*/
AH->encoding = PQclientEncoding(conn);
+ setFmtEncoding(AH->encoding);
std_strings = PQparameterStatus(conn, "standard_conforming_strings");
AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0);
diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c
index 27093220ab9..a44cd765c45 100644
--- a/src/bin/pg_dump/pg_dumpall.c
+++ b/src/bin/pg_dump/pg_dumpall.c
@@ -508,6 +508,7 @@ main(int argc, char *argv[])
* we know how to escape strings.
*/
encoding = PQclientEncoding(conn);
+ setFmtEncoding(encoding);
std_strings = PQparameterStatus(conn, "standard_conforming_strings");
if (!std_strings)
std_strings = "off";
diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c
index 8889f833714..66e7fb4cf31 100644
--- a/src/bin/psql/command.c
+++ b/src/bin/psql/command.c
@@ -1183,6 +1183,7 @@ exec_command_encoding(PsqlScanState scan_state, bool active_branch)
/* save encoding info into psql internal data */
pset.encoding = PQclientEncoding(pset.db);
pset.popt.topt.encoding = pset.encoding;
+ setFmtEncoding(pset.encoding);
SetVariable(pset.vars, "ENCODING",
pg_encoding_to_char(pset.encoding));
}
@@ -3467,6 +3468,8 @@ SyncVariables(void)
pset.popt.topt.encoding = pset.encoding;
pset.sversion = PQserverVersion(pset.db);
+ setFmtEncoding(pset.encoding);
+
SetVariable(pset.vars, "DBNAME", PQdb(pset.db));
SetVariable(pset.vars, "USER", PQuser(pset.db));
SetVariable(pset.vars, "HOST", PQhost(pset.db));
diff --git a/src/bin/scripts/common.c b/src/bin/scripts/common.c
index 2de696c19ef..ff79421a31d 100644
--- a/src/bin/scripts/common.c
+++ b/src/bin/scripts/common.c
@@ -376,8 +376,9 @@ appendQualifiedRelation(PQExpBuffer buf, const char *spec,
exit(1);
}
appendPQExpBufferStr(buf,
- fmtQualifiedId(PQgetvalue(res, 0, 1),
- PQgetvalue(res, 0, 0)));
+ fmtQualifiedIdEnc(PQgetvalue(res, 0, 1),
+ PQgetvalue(res, 0, 0),
+ PQclientEncoding(conn)));
appendPQExpBufferStr(buf, columns);
PQclear(res);
termPQExpBuffer(&sql);
diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
index b4d3e134d93..d9f55cc9f5d 100644
--- a/src/bin/scripts/createdb.c
+++ b/src/bin/scripts/createdb.c
@@ -190,6 +190,8 @@ main(int argc, char *argv[])
conn = connectMaintenanceDatabase(&cparams, progname, echo);
+ setFmtEncoding(PQclientEncoding(conn));
+
initPQExpBuffer(&sql);
appendPQExpBuffer(&sql, "CREATE DATABASE %s",
diff --git a/src/bin/scripts/createuser.c b/src/bin/scripts/createuser.c
index dbc2c2a58cd..7ec8ee51be7 100644
--- a/src/bin/scripts/createuser.c
+++ b/src/bin/scripts/createuser.c
@@ -271,6 +271,8 @@ main(int argc, char *argv[])
conn = connectMaintenanceDatabase(&cparams, progname, echo);
+ setFmtEncoding(PQclientEncoding(conn));
+
initPQExpBuffer(&sql);
printfPQExpBuffer(&sql, "CREATE ROLE %s", fmtId(newuser));
diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c
index ffdf12bfea7..0d636d0ef46 100644
--- a/src/bin/scripts/dropdb.c
+++ b/src/bin/scripts/dropdb.c
@@ -125,7 +125,7 @@ main(int argc, char *argv[])
initPQExpBuffer(&sql);
appendPQExpBuffer(&sql, "DROP DATABASE %s%s;",
- (if_exists ? "IF EXISTS " : ""), fmtId(dbname));
+ (if_exists ? "IF EXISTS " : ""), fmtIdEnc(dbname, PQclientEncoding(conn)));
/* Avoid trying to drop postgres db while we are connected to it. */
if (maintenance_db == NULL && strcmp(dbname, "postgres") == 0)
@@ -140,6 +140,12 @@ main(int argc, char *argv[])
conn = connectMaintenanceDatabase(&cparams, progname, echo);
+ initPQExpBuffer(&sql);
+ appendPQExpBuffer(&sql, "DROP DATABASE %s%s%s;",
+ (if_exists ? "IF EXISTS " : ""),
+ fmtIdEnc(dbname, PQclientEncoding(conn)),
+ force ? " WITH (FORCE)" : "");
+
if (echo)
printf("%s\n", sql.data);
result = PQexec(conn, sql.data);
diff --git a/src/bin/scripts/dropuser.c b/src/bin/scripts/dropuser.c
index a8be6b0784b..26523f85784 100644
--- a/src/bin/scripts/dropuser.c
+++ b/src/bin/scripts/dropuser.c
@@ -143,7 +143,8 @@ main(int argc, char *argv[])
initPQExpBuffer(&sql);
appendPQExpBuffer(&sql, "DROP ROLE %s%s;",
- (if_exists ? "IF EXISTS " : ""), fmtId(dropuser));
+ (if_exists ? "IF EXISTS " : ""),
+ fmtIdEnc(dropuser, PQclientEncoding(conn)));
if (echo)
printf("%s\n", sql.data);
diff --git a/src/bin/scripts/reindexdb.c b/src/bin/scripts/reindexdb.c
index 39b4078b411..b96d0ff54cf 100644
--- a/src/bin/scripts/reindexdb.c
+++ b/src/bin/scripts/reindexdb.c
@@ -325,7 +325,7 @@ reindex_one_database(const ConnParams *cparams,
else if (strcmp(type, "SCHEMA") == 0)
appendPQExpBufferStr(&sql, name);
else if (strcmp(type, "DATABASE") == 0)
- appendPQExpBufferStr(&sql, fmtId(PQdb(conn)));
+ appendPQExpBufferStr(&sql, fmtIdEnc(PQdb(conn),PQclientEncoding(conn)));
appendPQExpBufferChar(&sql, ';');
if (!executeMaintenanceCommand(conn, sql.data, echo))
@@ -403,7 +403,7 @@ reindex_system_catalogs(const ConnParams *cparams,
appendPQExpBufferStr(&sql, " SYSTEM ");
if (concurrently)
appendPQExpBuffer(&sql, "CONCURRENTLY ");
- appendPQExpBufferStr(&sql, fmtId(PQdb(conn)));
+ appendPQExpBufferStr(&sql, fmtIdEnc(PQdb(conn),PQclientEncoding(conn)));
appendPQExpBufferChar(&sql, ';');
if (!executeMaintenanceCommand(conn, sql.data, echo))
diff --git a/src/bin/scripts/vacuumdb.c b/src/bin/scripts/vacuumdb.c
index 6ade0c31a9d..8f9ce6529dc 100644
--- a/src/bin/scripts/vacuumdb.c
+++ b/src/bin/scripts/vacuumdb.c
@@ -602,8 +602,9 @@ vacuum_one_database(const ConnParams *cparams,
for (i = 0; i < ntups; i++)
{
appendPQExpBufferStr(&buf,
- fmtQualifiedId(PQgetvalue(res, i, 1),
- PQgetvalue(res, i, 0)));
+ fmtQualifiedIdEnc(PQgetvalue(res, i, 1),
+ PQgetvalue(res, i, 0),
+ PQclientEncoding(conn)));
if (tables_listed && !PQgetisnull(res, i, 2))
appendPQExpBufferStr(&buf, PQgetvalue(res, i, 2));
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index d5757becef2..05f0bd2576d 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -18,6 +18,7 @@
#include <ctype.h>
#include "fe_utils/string_utils.h"
+#include "mb/pg_wchar.h"
#include "common/keywords.h"
@@ -28,6 +29,8 @@ static PQExpBuffer defaultGetLocalPQExpBuffer(void);
int quote_all_identifiers = 0;
PQExpBuffer (*getLocalPQExpBuffer) (void) = defaultGetLocalPQExpBuffer;
+static int fmtIdEncoding = -1;
+
/*
* Returns a temporary PQExpBuffer, valid until the next call to the function.
@@ -56,14 +59,48 @@ defaultGetLocalPQExpBuffer(void)
return id_return;
}
+/*
+ * Set the encoding that fmtId() and fmtQualifiedId() use.
+ *
+ * This is not safe against multiple connections having different encodings,
+ * but there is no real other way to address the need to know the encoding for
+ * fmtId()/fmtQualifiedId() input for safe escaping. Eventually we should get
+ * rid of fmtId().
+ */
+void
+setFmtEncoding(int encoding)
+{
+ fmtIdEncoding = encoding;
+}
+
+/*
+ * Return the currently configured encoding for fmtId() and fmtQualifiedId().
+ */
+static int
+getFmtEncoding(void)
+{
+ if (fmtIdEncoding != -1)
+ return fmtIdEncoding;
+
+ /*
+ * In assertion builds it seems best to fail hard if the encoding was not
+ * set, to make it easier to find places with missing calls. But in
+ * production builds that seems like a bad idea, thus we instead just
+ * default to UTF-8.
+ */
+ Assert(fmtIdEncoding != -1);
+
+ return PG_UTF8;
+}
+
/*
* Quotes input string if it's not a legitimate SQL identifier as-is.
*
- * Note that the returned string must be used before calling fmtId again,
+ * Note that the returned string must be used before calling fmtIdEnc again,
* since we re-use the same return buffer each time.
*/
const char *
-fmtId(const char *rawid)
+fmtIdEnc(const char *rawid, int encoding)
{
PQExpBuffer id_return = getLocalPQExpBuffer();
@@ -136,7 +173,24 @@ fmtId(const char *rawid)
}
/*
- * fmtQualifiedId - construct a schema-qualified name, with quoting as needed.
+ * Quotes input string if it's not a legitimate SQL identifier as-is.
+ *
+ * Note that the returned string must be used before calling fmtId again,
+ * since we re-use the same return buffer each time.
+ *
+ * NB: This assumes setFmtEncoding() previously has been called to configure
+ * the encoding of rawid. It is preferable to use fmtIdEnc() with an
+ * explicit encoding.
+ */
+const char *
+fmtId(const char *rawid)
+{
+ return fmtIdEnc(rawid, getFmtEncoding());
+}
+
+/*
+ * fmtQualifiedIdEnc - construct a schema-qualified name, with quoting as
+ * needed.
*
* Like fmtId, use the result before calling again.
*
@@ -144,7 +198,7 @@ fmtId(const char *rawid)
* use that buffer until we're finished with calling fmtId().
*/
const char *
-fmtQualifiedId(const char *schema, const char *id)
+fmtQualifiedIdEnc(const char *schema, const char *id, int encoding)
{
PQExpBuffer id_return;
PQExpBuffer lcl_pqexp = createPQExpBuffer();
@@ -152,9 +206,9 @@ fmtQualifiedId(const char *schema, const char *id)
/* Some callers might fail to provide a schema name */
if (schema && *schema)
{
- appendPQExpBuffer(lcl_pqexp, "%s.", fmtId(schema));
+ appendPQExpBuffer(lcl_pqexp, "%s.", fmtIdEnc(schema, encoding));
}
- appendPQExpBufferStr(lcl_pqexp, fmtId(id));
+ appendPQExpBufferStr(lcl_pqexp, fmtIdEnc(id, encoding));
id_return = getLocalPQExpBuffer();
@@ -164,6 +218,24 @@ fmtQualifiedId(const char *schema, const char *id)
return id_return->data;
}
+/*
+ * fmtQualifiedId - construct a schema-qualified name, with quoting as needed.
+ *
+ * Like fmtId, use the result before calling again.
+ *
+ * Since we call fmtId and it also uses getLocalPQExpBuffer() we cannot
+ * use that buffer until we're finished with calling fmtId().
+ *
+ * NB: This assumes setFmtEncoding() previously has been called to configure
+ * the encoding of schema/id. It is preferable to use fmtQualifiedIdEnc()
+ * with an explicit encoding.
+ */
+const char *
+fmtQualifiedId(const char *schema, const char *id)
+{
+ return fmtQualifiedIdEnc(schema, id, getFmtEncoding());
+}
+
/*
* Format a Postgres version number (in the PG_VERSION_NUM integer format
diff --git a/src/include/fe_utils/string_utils.h b/src/include/fe_utils/string_utils.h
index 8c13cc0a66d..37f17f0b370 100644
--- a/src/include/fe_utils/string_utils.h
+++ b/src/include/fe_utils/string_utils.h
@@ -24,8 +24,11 @@ extern int quote_all_identifiers;
extern PQExpBuffer (*getLocalPQExpBuffer) (void);
/* Functions */
-extern const char *fmtId(const char *identifier);
+extern const char *fmtId(const char *rawid);
+extern const char *fmtIdEnc(const char *rawid, int encoding);
extern const char *fmtQualifiedId(const char *schema, const char *id);
+extern const char *fmtQualifiedIdEnc(const char *schema, const char *id, int encoding);
+extern void setFmtEncoding(int encoding);
extern char *formatPGVersionNumber(int version_number, bool include_minor,
char *buf, size_t buflen);
--
2.39.5 (Apple Git-154)
From 7c56df18c1f6e48c4343f2d6d1364c5825e45278 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 10 Feb 2025 10:03:37 -0500
Subject: [PATCH 3/8] Backport upstream commit:
5dc1e42b4fa6a4434afa7d7cdcf0291351a7b873 Fix handling of invalidly encoded
data in escaping functions
Previously invalidly encoded input to various escaping functions could lead to
the escaped string getting incorrectly parsed by psql. To be safe, escaping
functions need to ensure that neither invalid nor incomplete multi-byte
characters can be used to "escape" from being quoted.
Functions which can report errors now return an error in more cases than
before. Functions that cannot report errors now replace invalid input bytes
with a byte sequence that cannot be used to escape the quotes and that is
guaranteed to error out when a query is sent to the server.
The following functions are fixed by this commit:
- PQescapeLiteral()
- PQescapeIdentifier()
- PQescapeString()
- PQescapeStringConn()
- fmtId()
- appendStringLiteral()
---
src/fe_utils/string_utils.c | 170 ++++++++++++++++++++++++++-------
src/interfaces/libpq/fe-exec.c | 114 ++++++++++++++--------
2 files changed, 212 insertions(+), 72 deletions(-)
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index 05f0bd2576d..9f7151bd542 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -106,6 +106,7 @@ fmtIdEnc(const char *rawid, int encoding)
const char *cp;
bool need_quotes = false;
+ size_t remaining = strlen(rawid);
/*
* These checks need to match the identifier production in scan.l. Don't
@@ -119,7 +120,8 @@ fmtIdEnc(const char *rawid, int encoding)
else
{
/* otherwise check the entire string */
- for (cp = rawid; *cp; cp++)
+ cp = rawid;
+ for (size_t i = 0; i < remaining; i++, cp++)
{
if (!((*cp >= 'a' && *cp <= 'z')
|| (*cp >= '0' && *cp <= '9')
@@ -155,17 +157,90 @@ fmtIdEnc(const char *rawid, int encoding)
else
{
appendPQExpBufferChar(id_return, '"');
- for (cp = rawid; *cp; cp++)
+
+ cp = &rawid[0];
+ while (remaining > 0)
{
- /*
- * Did we find a double-quote in the string? Then make this a
- * double double-quote per SQL99. Before, we put in a
- * backslash/double-quote pair. - thomas 2000-08-05
- */
- if (*cp == '"')
- appendPQExpBufferChar(id_return, '"');
- appendPQExpBufferChar(id_return, *cp);
+ int charlen;
+
+ /* Fast path for plain ASCII */
+ if (!IS_HIGHBIT_SET(*cp))
+ {
+ /*
+ * Did we find a double-quote in the string? Then make this a
+ * double double-quote per SQL99. Before, we put in a
+ * backslash/double-quote pair. - thomas 2000-08-05
+ */
+ if (*cp == '"')
+ appendPQExpBufferChar(id_return, '"');
+ appendPQExpBufferChar(id_return, *cp);
+ remaining--;
+ cp++;
+ continue;
+ }
+
+ /* Slow path for possible multibyte characters */
+ charlen = pg_encoding_mblen(encoding, cp);
+
+ if (remaining < charlen)
+ {
+ /*
+ * If the character is longer than the available input,
+ * replace the string with an invalid sequence. The invalid
+ * sequence ensures that the escaped string will trigger an
+ * error on the server-side, even if we can't directly report
+ * an error here.
+ */
+ enlargePQExpBuffer(id_return, 2);
+ pg_encoding_set_invalid(encoding,
+ id_return->data + id_return->len);
+ id_return->len += 2;
+ id_return->data[id_return->len] = '\0';
+
+ /* there's no more input data, so we can stop */
+ break;
+ }
+ else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
+ {
+ /*
+ * Multibyte character is invalid. It's important to verify
+ * that as invalid multi-byte characters could e.g. be used to
+ * "skip" over quote characters, e.g. when parsing
+ * character-by-character.
+ *
+ * Replace the bytes corresponding to the invalid character
+ * with an invalid sequence, for the same reason as above.
+ *
+ * It would be a bit faster to verify the whole string the
+ * first time we encounter a set highbit, but this way we can
+ * replace just the invalid characters, which probably makes
+ * it easier for users to find the invalidly encoded portion
+ * of a larger string.
+ */
+ enlargePQExpBuffer(id_return, 2);
+ pg_encoding_set_invalid(encoding,
+ id_return->data + id_return->len);
+ id_return->len += 2;
+ id_return->data[id_return->len] = '\0';
+
+ /*
+ * Copy the rest of the string after the invalid multi-byte
+ * character.
+ */
+ remaining -= charlen;
+ cp += charlen;
+ }
+ else
+ {
+ for (int i = 0; i < charlen; i++)
+ {
+ appendPQExpBufferChar(id_return, *cp);
+ remaining--;
+ cp++;
+ }
+ }
}
+
appendPQExpBufferChar(id_return, '"');
}
@@ -292,6 +367,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
size_t length = strlen(str);
const char *source = str;
char *target;
+ size_t remaining = length;
if (!enlargePQExpBuffer(buf, 2 * length + 2))
return;
@@ -299,10 +375,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
target = buf->data + buf->len;
*target++ = '\'';
- while (*source != '\0')
+ while (remaining > 0)
{
char c = *source;
- int len;
+ int charlen;
int i;
/* Fast path for plain ASCII */
@@ -314,39 +390,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
/* Copy the character */
*target++ = c;
source++;
+ remaining--;
continue;
}
/* Slow path for possible multibyte characters */
- len = PQmblen(source, encoding);
+ charlen = PQmblen(source, encoding);
- /* Copy the character */
- for (i = 0; i < len; i++)
+ if (remaining < charlen)
{
- if (*source == '\0')
- break;
- *target++ = *source++;
- }
+ /*
+ * If the character is longer than the available input, replace
+ * the string with an invalid sequence. The invalid sequence
+ * ensures that the escaped string will trigger an error on the
+ * server-side, even if we can't directly report an error here.
+ *
+ * We know there's enough space for the invalid sequence because
+ * the "target" buffer is 2 * length + 2 long, and at worst we're
+ * replacing a single input byte with two invalid bytes.
+ */
+ pg_encoding_set_invalid(encoding, target);
+ target += 2;
- /*
- * If we hit premature end of string (ie, incomplete multibyte
- * character), try to pad out to the correct length with spaces. We
- * may not be able to pad completely, but we will always be able to
- * insert at least one pad space (since we'd not have quoted a
- * multibyte character). This should be enough to make a string that
- * the server will error out on.
- */
- if (i < len)
+ /* there's no more valid input data, so we can stop */
+ break;
+ }
+ else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
{
- char *stop = buf->data + buf->maxlen - 2;
+ /*
+ * Multibyte character is invalid. It's important to verify that
+ * as invalid multi-byte characters could e.g. be used to "skip"
+ * over quote characters, e.g. when parsing
+ * character-by-character.
+ *
+ * Replace the bytes corresponding to the invalid character with
+ * an invalid sequence, for the same reason as above.
+ *
+ * It would be a bit faster to verify the whole string the first
+ * time we encounter a set highbit, but this way we can replace
+ * just the invalid characters, which probably makes it easier for
+ * users to find the invalidly encoded portion of a larger string.
+ */
+ pg_encoding_set_invalid(encoding, target);
+ target += 2;
+ remaining -= charlen;
- for (; i < len; i++)
+ /*
+ * Copy the rest of the string after the invalid multi-byte
+ * character.
+ */
+ source += charlen;
+ }
+ else
+ {
+ /* Copy the character */
+ for (i = 0; i < charlen; i++)
{
- if (target >= stop)
- break;
- *target++ = ' ';
+ *target++ = *source++;
+ remaining--;
}
- break;
}
}
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index ff101c4ca2a..97cd2c53673 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -3348,15 +3348,15 @@ PQescapeStringInternal(PGconn *conn,
{
const char *source = from;
char *target = to;
- size_t remaining = length;
+ size_t remaining = strnlen(from, length);
if (error)
*error = 0;
- while (remaining > 0 && *source != '\0')
+ while (remaining > 0)
{
char c = *source;
- int len;
+ int charlen;
int i;
/* Fast path for plain ASCII */
@@ -3373,39 +3373,48 @@ PQescapeStringInternal(PGconn *conn,
}
/* Slow path for possible multibyte characters */
- len = pg_encoding_mblen(encoding, source);
+ charlen = pg_encoding_mblen(encoding, source);
- /* Copy the character */
- for (i = 0; i < len; i++)
- {
- if (remaining == 0 || *source == '\0')
- break;
- *target++ = *source++;
- remaining--;
- }
-
- /*
- * If we hit premature end of string (ie, incomplete multibyte
- * character), try to pad out to the correct length with spaces. We
- * may not be able to pad completely, but we will always be able to
- * insert at least one pad space (since we'd not have quoted a
- * multibyte character). This should be enough to make a string that
- * the server will error out on.
- */
- if (i < len)
+ if (remaining < charlen ||
+ pg_encoding_verifymbchar(encoding, source, charlen) == -1)
{
+ /*
+ * If the character is longer than the available input, report an
+ * error if possible, and replace the string with an invalid
+ * sequence. The invalid sequence ensures that the escaped string
+ * will trigger an error on the server-side, even if we can't
+ * directly report an error here.
+ *
+ * This isn't *that* crucial when we can report an error to the
+ * caller, but if we can't, the caller will use this string
+ * unmodified and it needs to be safe for parsing.
+ *
+ * We know there's enough space for the invalid sequence because
+ * the "to" buffer needs to be at least 2 * length + 1 long, and
+ * at worst we're replacing a single input byte with two invalid
+ * bytes.
+ */
if (error)
*error = 1;
if (conn)
- printfPQExpBuffer(&conn->errorMessage,
- libpq_gettext("incomplete multibyte character\n"));
- for (; i < len; i++)
+ libpq_append_conn_error(conn, "incomplete multibyte character");
+
+ pg_encoding_set_invalid(encoding, target);
+ target += 2;
+ source++;
+ remaining--;
+
+ /* there's no more input data, so we can stop */
+ break;
+ }
+ else
+ {
+ /* Copy the character */
+ for (i = 0; i < charlen; i++)
{
- if (((size_t) (target - to)) / 2 >= length)
- break;
- *target++ = ' ';
+ *target++ = *source++;
+ remaining--;
}
- break;
}
}
@@ -3451,21 +3460,27 @@ PQescapeString(char *to, const char *from, size_t length)
static char *
PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
{
- const char *s;
+ const char *s;
char *result;
char *rp;
int num_quotes = 0; /* single or double, depending on as_ident */
int num_backslashes = 0;
- int input_len;
- int result_size;
+ size_t input_len = strnlen(str, len);
+ size_t result_size;
char quote_char = as_ident ? '"' : '\'';
+ bool validated_mb = false;
/* We must have a connection, else fail immediately. */
if (!conn)
return NULL;
- /* Scan the string for characters that must be escaped. */
- for (s = str; (s - str) < len && *s != '\0'; ++s)
+
+ /*
+ * Scan the string for characters that must be escaped and for invalidly
+ * encoded data.
+ */
+ s = str;
+ for (size_t remaining = input_len; remaining > 0; remaining--, s++)
{
if (*s == quote_char)
++num_quotes;
@@ -3478,21 +3493,42 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
/* Slow path for possible multibyte characters */
charlen = pg_encoding_mblen(conn->client_encoding, s);
- /* Multibyte character overruns allowable length. */
- if ((s - str) + charlen > len || memchr(s, 0, charlen) != NULL)
+ if (charlen > remaining)
{
printfPQExpBuffer(&conn->errorMessage,
libpq_gettext("incomplete multibyte character\n"));
return NULL;
}
+ /*
+ * If we haven't already, check that multibyte characters are
+ * valid. It's important to verify that as invalid multi-byte
+ * characters could e.g. be used to "skip" over quote characters,
+ * e.g. when parsing character-by-character.
+ *
+ * We check validity once, for the whole remainder of the string,
+ * when we first encounter any multi-byte character. Some
+ * encodings have optimized implementations for longer strings.
+ */
+ if (!validated_mb)
+ {
+ if (pg_encoding_verifymbstr(conn->client_encoding, s, remaining)
+ != remaining)
+ {
+ printfPQExpBuffer(&conn->errorMessage,
+ libpq_gettext("invalid multibyte character\n"));
+ return NULL;
+ }
+ validated_mb = true;
+ }
+
/* Adjust s, bearing in mind that for loop will increment it. */
s += charlen - 1;
+ remaining -= charlen - 1;
}
}
/* Allocate output buffer. */
- input_len = s - str;
result_size = input_len + num_quotes + 3; /* two quotes, plus a NUL */
if (!as_ident && num_backslashes > 0)
result_size += num_backslashes + 2;
@@ -3538,7 +3574,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
}
else
{
- for (s = str; s - str < input_len; ++s)
+ s = str;
+ for (size_t remaining = input_len; remaining > 0; remaining--, s++)
{
if (*s == quote_char || (!as_ident && *s == '\\'))
{
@@ -3556,6 +3593,7 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
*rp++ = *s;
if (--i == 0)
break;
+ remaining--;
++s; /* for loop will provide the final increment */
}
}
--
2.39.5 (Apple Git-154)
From 3751ccde18122412fcbfcc2df583cf66fefdbab0 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Mon, 10 Feb 2025 16:30:03 -0500
Subject: [PATCH 4/8] Backport upstream commit
5bf12323b6b8b05790aab6876555568898f4fc81 Adapt appendPsqlMetaConnect() to the
new fmtId() encoding expectations.
We need to tell fmtId() what encoding to assume, but this function
doesn't know that. Fortunately we can fix that without changing the
function's API, because we can just use SQL_ASCII. That's because
database names in connection requests are effectively binary not text:
no encoding-aware processing will happen on them.
This fixes XversionUpgrade failures seen in the buildfarm. The
alternative of having pg_upgrade use setFmtEncoding() is unappetizing,
given that it's connecting to multiple databases that may have
different encodings.
Andres Freund, Noah Misch, Tom Lane
Security: CVE-2025-1094
---
src/fe_utils/string_utils.c | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index 9f7151bd542..a289d3001eb 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -792,16 +792,22 @@ appendPsqlMetaConnect(PQExpBuffer buf, const char *dbname)
}
}
- appendPQExpBufferStr(buf, "\\connect ");
if (complex)
{
PQExpBufferData connstr;
initPQExpBuffer(&connstr);
- appendPQExpBuffer(&connstr, "dbname=");
- appendConnStrVal(&connstr, dbname);
- appendPQExpBuffer(buf, "-reuse-previous=on ");
+ /*
+ * Force the target psql's encoding to SQL_ASCII. We don't really
+ * know the encoding of the database name, and it doesn't matter as
+ * long as psql will forward it to the server unchanged.
+ */
+ appendPQExpBufferStr(buf, "\\encoding SQL_ASCII\n");
+ appendPQExpBufferStr(buf, "\\connect -reuse-previous=on ");
+
+ appendPQExpBufferStr(&connstr, "dbname=");
+ appendConnStrVal(&connstr, dbname);
/*
* As long as the name does not contain a newline, SQL identifier
@@ -809,12 +815,15 @@ appendPsqlMetaConnect(PQExpBuffer buf, const char *dbname)
* involve psql-interpreted single quotes, which behaved differently
* before PostgreSQL 9.2.
*/
- appendPQExpBufferStr(buf, fmtId(connstr.data));
+ appendPQExpBufferStr(buf, fmtIdEnc(connstr.data, PG_SQL_ASCII));
termPQExpBuffer(&connstr);
}
else
- appendPQExpBufferStr(buf, fmtId(dbname));
+ {
+ appendPQExpBufferStr(buf, "\\connect ");
+ appendPQExpBufferStr(buf, fmtIdEnc(dbname, PG_SQL_ASCII));
+ }
appendPQExpBufferChar(buf, '\n');
}
--
2.39.5 (Apple Git-154)
From 84b7b93568fa4523afb66d2d1776f5e24b5db1de Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 15 Feb 2025 16:20:21 -0500
Subject: [PATCH 5/8] Backport upstream commit:
9f45e6a91d8460ac0b1f30e6ae3eefb185b8d0ab Make escaping functions retain
trailing bytes of an invalid character.
Instead of dropping the trailing byte(s) of an invalid or incomplete
multibyte character, replace only the first byte with a known-invalid
sequence, and process the rest normally. This seems less likely to
confuse incautious callers than the behavior adopted in 5dc1e42b4.
While we're at it, adjust PQescapeStringInternal to produce at most
one bleat about invalid multibyte characters per string. This
matches the behavior of PQescapeInternal, and avoids the risk of
producing tons of repetitive junk if a long string is simply given
in the wrong encoding.
This is a followup to the fixes for CVE-2025-1094, and should be
included if cherry-picking those fixes.
Author: Andres Freund <andres@anarazel.de>
Co-authored-by: Tom Lane <tgl@sss.pgh.pa.us>
Reported-by: Jeff Davis <pgsql@j-davis.com>
Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com
---
src/fe_utils/string_utils.c | 91 +++++++++++++---------------------
src/interfaces/libpq/fe-exec.c | 22 ++++----
2 files changed, 47 insertions(+), 66 deletions(-)
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index a289d3001eb..a2d5ccd1e28 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -182,40 +182,25 @@ fmtIdEnc(const char *rawid, int encoding)
/* Slow path for possible multibyte characters */
charlen = pg_encoding_mblen(encoding, cp);
- if (remaining < charlen)
- {
- /*
- * If the character is longer than the available input,
- * replace the string with an invalid sequence. The invalid
- * sequence ensures that the escaped string will trigger an
- * error on the server-side, even if we can't directly report
- * an error here.
- */
- enlargePQExpBuffer(id_return, 2);
- pg_encoding_set_invalid(encoding,
- id_return->data + id_return->len);
- id_return->len += 2;
- id_return->data[id_return->len] = '\0';
-
- /* there's no more input data, so we can stop */
- break;
- }
- else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
+ if (remaining < charlen ||
+ pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
{
/*
* Multibyte character is invalid. It's important to verify
- * that as invalid multi-byte characters could e.g. be used to
+ * that as invalid multibyte characters could e.g. be used to
* "skip" over quote characters, e.g. when parsing
* character-by-character.
*
- * Replace the bytes corresponding to the invalid character
- * with an invalid sequence, for the same reason as above.
+ * Replace the character's first byte with an invalid
+ * sequence. The invalid sequence ensures that the escaped
+ * string will trigger an error on the server-side, even if we
+ * can't directly report an error here.
*
* It would be a bit faster to verify the whole string the
* first time we encounter a set highbit, but this way we can
- * replace just the invalid characters, which probably makes
- * it easier for users to find the invalidly encoded portion
- * of a larger string.
+ * replace just the invalid data, which probably makes it
+ * easier for users to find the invalidly encoded portion of a
+ * larger string.
*/
enlargePQExpBuffer(id_return, 2);
pg_encoding_set_invalid(encoding,
@@ -224,11 +209,13 @@ fmtIdEnc(const char *rawid, int encoding)
id_return->data[id_return->len] = '\0';
/*
- * Copy the rest of the string after the invalid multi-byte
- * character.
+ * Handle the following bytes as if this byte didn't exist.
+ * That's safer in case the subsequent bytes contain
+ * characters that are significant for the caller (e.g. '>' in
+ * html).
*/
- remaining -= charlen;
- cp += charlen;
+ remaining--;
+ cp++;
}
else
{
@@ -397,49 +384,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
/* Slow path for possible multibyte characters */
charlen = PQmblen(source, encoding);
- if (remaining < charlen)
- {
- /*
- * If the character is longer than the available input, replace
- * the string with an invalid sequence. The invalid sequence
- * ensures that the escaped string will trigger an error on the
- * server-side, even if we can't directly report an error here.
- *
- * We know there's enough space for the invalid sequence because
- * the "target" buffer is 2 * length + 2 long, and at worst we're
- * replacing a single input byte with two invalid bytes.
- */
- pg_encoding_set_invalid(encoding, target);
- target += 2;
-
- /* there's no more valid input data, so we can stop */
- break;
- }
- else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
+ if (remaining < charlen ||
+ pg_encoding_verifymbchar(encoding, source, charlen) == -1)
{
/*
* Multibyte character is invalid. It's important to verify that
- * as invalid multi-byte characters could e.g. be used to "skip"
+ * as invalid multibyte characters could e.g. be used to "skip"
* over quote characters, e.g. when parsing
* character-by-character.
*
- * Replace the bytes corresponding to the invalid character with
- * an invalid sequence, for the same reason as above.
+ * Replace the character's first byte with an invalid sequence.
+ * The invalid sequence ensures that the escaped string will
+ * trigger an error on the server-side, even if we can't directly
+ * report an error here.
+ *
+ * We know there's enough space for the invalid sequence because
+ * the "target" buffer is 2 * length + 2 long, and at worst we're
+ * replacing a single input byte with two invalid bytes.
*
* It would be a bit faster to verify the whole string the first
* time we encounter a set highbit, but this way we can replace
- * just the invalid characters, which probably makes it easier for
- * users to find the invalidly encoded portion of a larger string.
+ * just the invalid data, which probably makes it easier for users
+ * to find the invalidly encoded portion of a larger string.
*/
pg_encoding_set_invalid(encoding, target);
target += 2;
- remaining -= charlen;
/*
- * Copy the rest of the string after the invalid multi-byte
- * character.
+ * Handle the following bytes as if this byte didn't exist. That's
+ * safer in case the subsequent bytes contain important characters
+ * for the caller (e.g. '>' in html).
*/
- source += charlen;
+ source++;
+ remaining--;
}
else
{
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 97cd2c53673..a29d19a6268 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -3349,6 +3349,7 @@ PQescapeStringInternal(PGconn *conn,
const char *source = from;
char *target = to;
size_t remaining = strnlen(from, length);
+ bool already_complained = false;
if (error)
*error = 0;
@@ -3379,15 +3380,20 @@ PQescapeStringInternal(PGconn *conn,
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
{
/*
- * If the character is longer than the available input, report an
- * error if possible, and replace the string with an invalid
- * sequence. The invalid sequence ensures that the escaped string
- * will trigger an error on the server-side, even if we can't
- * directly report an error here.
+ * Multibyte character is invalid. It's important to verify that
+ * as invalid multibyte characters could e.g. be used to "skip"
+ * over quote characters, e.g. when parsing
+ * character-by-character.
+ *
+ * Report an error if possible, and replace the character's first
+ * byte with an invalid sequence. The invalid sequence ensures
+ * that the escaped string will trigger an error on the
+ * server-side, even if we can't directly report an error here.
*
* This isn't *that* crucial when we can report an error to the
- * caller, but if we can't, the caller will use this string
- * unmodified and it needs to be safe for parsing.
+ * caller; but if we can't or the caller ignores it, the caller
+ * will use this string unmodified and it needs to be safe for
+ * parsing.
*
* We know there's enough space for the invalid sequence because
* the "to" buffer needs to be at least 2 * length + 1 long, and
@@ -3404,8 +3410,6 @@ PQescapeStringInternal(PGconn *conn,
source++;
remaining--;
- /* there's no more input data, so we can stop */
- break;
}
else
{
--
2.39.5 (Apple Git-154)
From 21118244dad366d20e1d11549df03dd56e76dbaa Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 16 Feb 2025 12:46:35 -0500
Subject: [PATCH 6/8] In fmtIdEnc(), handle failure of enlargePQExpBuffer().
Coverity complained that we weren't doing that, and it's right.
This fix just makes fmtIdEnc() honor the general convention that OOM
causes a PQExpBuffer to become marked "broken", without any immediate
error. In the pretty-unlikely case that we actually did hit OOM here,
the end result would be to return an empty string to the caller,
probably resulting in invalid SQL syntax in an issued command (if
nothing else went wrong, which is even more unlikely). It's tempting
to throw an "out of memory" error if the buffer becomes broken, but
there's not a lot of point in doing that only here and not in hundreds
of other PQExpBuffer-using places in pg_dump and similar callers.
The whole issue could do with some non-time-crunched redesign, perhaps.
This is a followup to the fixes for CVE-2025-1094, and should be
included if cherry-picking those fixes.
---
src/fe_utils/string_utils.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index a2d5ccd1e28..fe280df3c0f 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -202,11 +202,13 @@ fmtIdEnc(const char *rawid, int encoding)
* easier for users to find the invalidly encoded portion of a
* larger string.
*/
- enlargePQExpBuffer(id_return, 2);
- pg_encoding_set_invalid(encoding,
- id_return->data + id_return->len);
- id_return->len += 2;
- id_return->data[id_return->len] = '\0';
+ if (enlargePQExpBuffer(id_return, 2))
+ {
+ pg_encoding_set_invalid(encoding,
+ id_return->data + id_return->len);
+ id_return->len += 2;
+ id_return->data[id_return->len] = '\0';
+ }
/*
* Handle the following bytes as if this byte didn't exist.
--
2.39.5 (Apple Git-154)
From 6f42371a3c3911299c081afe3478022c496b07a9 Mon Sep 17 00:00:00 2001
From: Filip Janus <fjanus@redhat.com>
Date: Mon, 17 Mar 2025 18:14:05 +0100
Subject: [PATCH 7/8] Backport multiple changes from postgresql13. In
particular, the wchar.c functionality was moved from the backend to the
common directory, which means that functionality can be used by the server
and also by libpq. Due to the necessary changes there are a couple of
"reverts" of previous commits in src/backend/utils/mb/wchar.c, but that is
expected because the code is now linked with the implementation from
src/common/wchar.c instead of src/backend/utils/mb/wchar.c.
---
src/backend/utils/mb/wchar.c | 101 +-
src/bin/scripts/dropdb.c | 5 +-
src/common/Makefile | 2 +-
src/common/wchar.c | 1728 ++++++++++++++++++
src/include/common/unicode_combining_table.h | 196 ++
src/include/mb/pg_wchar.h | 4 +
src/interfaces/libpq/fe-exec.c | 7 +-
7 files changed, 1958 insertions(+), 85 deletions(-)
create mode 100644 src/common/wchar.c
create mode 100644 src/include/common/unicode_combining_table.h
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 872241cc804..1ca6094d2a3 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -14,25 +14,6 @@
#include "mb/pg_wchar.h"
-/*
- * In today's multibyte encodings other than UTF8, this two-byte sequence
- * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
- *
- * For historical reasons, several verifychar implementations opt to reject
- * this pair specifically. Byte pair range constraints, in encoding
- * originator documentation, always excluded this pair. No core conversion
- * could translate it. However, longstanding verifychar implementations
- * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
- * pairs not valid per encoding originator documentation. To avoid tightening
- * core or non-core conversions in a security patch, we sought this one pair.
- *
- * PQescapeString() historically used spaces for BYTE1; many other values
- * could suffice for BYTE1.
- */
-#define NONUTF8_INVALID_BYTE0 (0x8d)
-#define NONUTF8_INVALID_BYTE1 (' ')
-
-
/*
* Operations on multi-byte encodings are driven by a table of helper
* functions.
@@ -496,7 +477,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
/*
* Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
* space allocated.
- */
+ *
unsigned char *
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
{
@@ -525,7 +506,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
return utf8string;
}
-
+*/
/*
* Trivial conversion from pg_wchar to UTF-8.
* caller should allocate enough space for "to"
@@ -562,7 +543,7 @@ pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
*
* pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
* other places would need to be fixed to change this.
- */
+ *
int
pg_utf_mblen(const unsigned char *s)
{
@@ -586,7 +567,7 @@ pg_utf_mblen(const unsigned char *s)
len = 1;
return len;
}
-
+*/
/*
* This is an implementation of wcwidth() and wcswidth() as defined in
* "The Single UNIX Specification, Version 2, The Open Group, 1997"
@@ -765,7 +746,7 @@ ucs_wcwidth(pg_wchar ucs)
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
- */
+ *
pg_wchar
utf8_to_unicode(const unsigned char *c)
{
@@ -784,10 +765,10 @@ utf8_to_unicode(const unsigned char *c)
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
- /* that is an invalid code on purpose */
+ // that is an invalid code on purpose
return 0xffffffff;
}
-
+*/
static int
pg_utf_dsplen(const unsigned char *s)
{
@@ -917,7 +898,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
*to = 0;
return cnt;
}
-
+/*
int
pg_mule_mblen(const unsigned char *s)
{
@@ -932,9 +913,9 @@ pg_mule_mblen(const unsigned char *s)
else if (IS_LCPRV2(*s))
len = 4;
else
- len = 1; /* assume ASCII */
+ len = 1;
return len;
-}
+}*/
static int
pg_mule_dsplen(const unsigned char *s)
@@ -1413,11 +1394,6 @@ pg_big5_verifier(const unsigned char *s, int len)
if (len < l)
return -1;
- if (l == 2 &&
- s[0] == NONUTF8_INVALID_BYTE0 &&
- s[1] == NONUTF8_INVALID_BYTE1)
- return -1;
-
while (--l > 0)
{
if (*++s == '\0')
@@ -1438,11 +1414,6 @@ pg_gbk_verifier(const unsigned char *s, int len)
if (len < l)
return -1;
- if (l == 2 &&
- s[0] == NONUTF8_INVALID_BYTE0 &&
- s[1] == NONUTF8_INVALID_BYTE1)
- return -1;
-
while (--l > 0)
{
if (*++s == '\0')
@@ -1463,11 +1434,6 @@ pg_uhc_verifier(const unsigned char *s, int len)
if (len < l)
return -1;
- if (l == 2 &&
- s[0] == NONUTF8_INVALID_BYTE0 &&
- s[1] == NONUTF8_INVALID_BYTE1)
- return -1;
-
while (--l > 0)
{
if (*++s == '\0')
@@ -1535,7 +1501,7 @@ pg_utf8_verifier(const unsigned char *s, int len)
*
* length is assumed to have been obtained by pg_utf_mblen(), and the
* caller must have checked that that many bytes are present in the buffer.
- */
+ *
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
@@ -1544,18 +1510,15 @@ pg_utf8_islegal(const unsigned char *source, int length)
switch (length)
{
default:
- /* reject lengths 5 and 6 for now */
return false;
case 4:
a = source[3];
if (a < 0x80 || a > 0xBF)
return false;
- /* FALL THRU */
case 3:
a = source[2];
if (a < 0x80 || a > 0xBF)
return false;
- /* FALL THRU */
case 2:
a = source[1];
switch (*source)
@@ -1581,7 +1544,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
return false;
break;
}
- /* FALL THRU */
case 1:
a = *source;
if (a >= 0x80 && a < 0xC2)
@@ -1592,7 +1554,7 @@ pg_utf8_islegal(const unsigned char *source, int length)
}
return true;
}
-
+*/
#ifndef FRONTEND
/*
@@ -1802,26 +1764,13 @@ pg_eucjp_increment(unsigned char *charptr, int length)
#endif /* !FRONTEND */
-/*
- * Fills the provided buffer with two bytes such that:
- * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
- */
-void
-pg_encoding_set_invalid(int encoding, char *dst)
-{
- Assert(pg_encoding_max_length(encoding) > 1);
-
- dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
- dst[1] = NONUTF8_INVALID_BYTE1;
-}
-
/*
*-------------------------------------------------------------------
* encoding info table
* XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
*-------------------------------------------------------------------
*/
-const pg_wchar_tbl pg_wchar_table[] = {
+const pg_wchar_tbl pg_wchar_table1[] = {
{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
@@ -1875,7 +1824,7 @@ pg_mic_mblen(const unsigned char *mbstr)
/*
* Returns the byte length of a multibyte character.
- */
+ *
int
pg_encoding_mblen(int encoding, const char *mbstr)
{
@@ -1883,10 +1832,10 @@ pg_encoding_mblen(int encoding, const char *mbstr)
pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
}
-
+*/
/*
* Returns the display length of a multibyte character.
- */
+ *
int
pg_encoding_dsplen(int encoding, const char *mbstr)
{
@@ -1894,12 +1843,12 @@ pg_encoding_dsplen(int encoding, const char *mbstr)
pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
}
-
+*/
/*
* Verify the first multibyte character of the given string.
* Return its byte length if good, -1 if bad. (See comments above for
* full details of the mbverify API.)
- */
+ *
int
pg_encoding_verifymb(int encoding, const char *mbstr, int len)
{
@@ -1907,24 +1856,18 @@ pg_encoding_verifymb(int encoding, const char *mbstr, int len)
pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
}
-
+*/
/*
* fetch maximum length of a given encoding
- */
+ *
int
pg_encoding_max_length(int encoding)
{
Assert(PG_VALID_ENCODING(encoding));
- /*
- * Check for the encoding despite the assert, due to some mingw versions
- * otherwise issuing bogus warnings.
- */
- return PG_VALID_ENCODING(encoding) ?
- pg_wchar_table[encoding].maxmblen :
- pg_wchar_table[PG_SQL_ASCII].maxmblen;
+ return pg_wchar_table[encoding].maxmblen;
}
-
+*/
#ifndef FRONTEND
/*
diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c
index 0d636d0ef46..ed3a2c8c19a 100644
--- a/src/bin/scripts/dropdb.c
+++ b/src/bin/scripts/dropdb.c
@@ -141,10 +141,9 @@ main(int argc, char *argv[])
conn = connectMaintenanceDatabase(&cparams, progname, echo);
initPQExpBuffer(&sql);
- appendPQExpBuffer(&sql, "DROP DATABASE %s%s%s;",
+ appendPQExpBuffer(&sql, "DROP DATABASE %s%s;",
(if_exists ? "IF EXISTS " : ""),
- fmtIdEnc(dbname, PQclientEncoding(conn)),
- force ? " WITH (FORCE)" : "");
+ fmtIdEnc(dbname, PQclientEncoding(conn)));
if (echo)
printf("%s\n", sql.data);
diff --git a/src/common/Makefile b/src/common/Makefile
index 2f22b9b101d..c26d938b31e 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -50,7 +50,7 @@ OBJS_COMMON = base64.o config_info.o controldata_utils.o d2s.o exec.o f2s.o \
file_perm.o ip.o keywords.o kwlookup.o link-canary.o md5.o \
pg_lzcompress.o pgfnames.o psprintf.o relpath.o \
rmtree.o saslprep.o scram-common.o string.o unicode_norm.o \
- username.o wait_error.o
+ username.o wait_error.o wchar.o
ifeq ($(with_openssl),yes)
OBJS_COMMON += sha2_openssl.o
diff --git a/src/common/wchar.c b/src/common/wchar.c
new file mode 100644
index 00000000000..85822b2c3b5
--- /dev/null
+++ b/src/common/wchar.c
@@ -0,0 +1,1728 @@
+/*-------------------------------------------------------------------------
+ *
+ * wchar.c
+ * Functions for working with multibyte characters in various encodings.
+ *
+ * Portions Copyright (c) 1998-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/wchar.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "mb/pg_wchar.h"
+
+
+/*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically. Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair. No core conversion
+ * could translate it. However, longstanding verifychar implementations
+ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation. To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)
+#define NONUTF8_INVALID_BYTE1 (' ')
+
+
+/*
+ * Operations on multi-byte encodings are driven by a table of helper
+ * functions.
+ *
+ * To add an encoding support, define mblen(), dsplen() and verifier() for
+ * the encoding. For server-encodings, also define mb2wchar() and wchar2mb()
+ * conversion functions.
+ *
+ * These functions generally assume that their input is validly formed.
+ * The "verifier" functions, further down in the file, have to be more
+ * paranoid.
+ *
+ * We expect that mblen() does not need to examine more than the first byte
+ * of the character to discover the correct length. GB18030 is an exception
+ * to that rule, though, as it also looks at second byte. But even that
+ * behaves in a predictable way, if you only pass the first byte: it will
+ * treat 4-byte encoded characters as two 2-byte encoded characters, which is
+ * good enough for all current uses.
+ *
+ * Note: for the display output of psql to work properly, the return values
+ * of the dsplen functions must conform to the Unicode standard. In particular
+ * the NUL character is zero width and control characters are generally
+ * width -1. It is recommended that non-ASCII encodings refer their ASCII
+ * subset to the ASCII routines to ensure consistency.
+ */
+
+/*
+ * SQL/ASCII
+ */
+static int
+pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0;
+
+ while (len > 0 && *from)
+ {
+ *to++ = *from++;
+ len--;
+ cnt++;
+ }
+ *to = 0;
+ return cnt;
+}
+
+static int
+pg_ascii_mblen(const unsigned char *s)
+{
+ return 1;
+}
+
+static int
+pg_ascii_dsplen(const unsigned char *s)
+{
+ if (*s == '\0')
+ return 0;
+ if (*s < 0x20 || *s == 0x7f)
+ return -1;
+
+ return 1;
+}
+
+/*
+ * EUC
+ */
+static int
+pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0;
+
+ while (len > 0 && *from)
+ {
+ if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
+ * KANA") */
+ {
+ from++;
+ *to = (SS2 << 8) | *from++;
+ len -= 2;
+ }
+ else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
+ {
+ from++;
+ *to = (SS3 << 16) | (*from++ << 8);
+ *to |= *from++;
+ len -= 3;
+ }
+ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
+ {
+ *to = *from++ << 8;
+ *to |= *from++;
+ len -= 2;
+ }
+ else /* must be ASCII */
+ {
+ *to = *from++;
+ len--;
+ }
+ to++;
+ cnt++;
+ }
+ *to = 0;
+ return cnt;
+}
+
+static inline int
+pg_euc_mblen(const unsigned char *s)
+{
+ int len;
+
+ if (*s == SS2)
+ len = 2;
+ else if (*s == SS3)
+ len = 3;
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = 1;
+ return len;
+}
+
+static inline int
+pg_euc_dsplen(const unsigned char *s)
+{
+ int len;
+
+ if (*s == SS2)
+ len = 2;
+ else if (*s == SS3)
+ len = 2;
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = pg_ascii_dsplen(s);
+ return len;
+}
+
+/*
+ * EUC_JP
+ */
+static int
+pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ return pg_euc2wchar_with_len(from, to, len);
+}
+
+static int
+pg_eucjp_mblen(const unsigned char *s)
+{
+ return pg_euc_mblen(s);
+}
+
+static int
+pg_eucjp_dsplen(const unsigned char *s)
+{
+ int len;
+
+ if (*s == SS2)
+ len = 1;
+ else if (*s == SS3)
+ len = 2;
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = pg_ascii_dsplen(s);
+ return len;
+}
+
+/*
+ * EUC_KR
+ */
+static int
+pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ return pg_euc2wchar_with_len(from, to, len);
+}
+
+static int
+pg_euckr_mblen(const unsigned char *s)
+{
+ return pg_euc_mblen(s);
+}
+
+static int
+pg_euckr_dsplen(const unsigned char *s)
+{
+ return pg_euc_dsplen(s);
+}
+
+/*
+ * EUC_CN
+ *
+ */
+static int
+pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0;
+
+ while (len > 0 && *from)
+ {
+ if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
+ {
+ from++;
+ *to = (SS2 << 16) | (*from++ << 8);
+ *to |= *from++;
+ len -= 3;
+ }
+ else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
+ {
+ from++;
+ *to = (SS3 << 16) | (*from++ << 8);
+ *to |= *from++;
+ len -= 3;
+ }
+ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
+ {
+ *to = *from++ << 8;
+ *to |= *from++;
+ len -= 2;
+ }
+ else
+ {
+ *to = *from++;
+ len--;
+ }
+ to++;
+ cnt++;
+ }
+ *to = 0;
+ return cnt;
+}
+
+static int
+pg_euccn_mblen(const unsigned char *s)
+{
+ int len;
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = 1;
+ return len;
+}
+
+static int
+pg_euccn_dsplen(const unsigned char *s)
+{
+ int len;
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = pg_ascii_dsplen(s);
+ return len;
+}
+
+/*
+ * EUC_TW
+ *
+ */
+static int
+pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0;
+
+ while (len > 0 && *from)
+ {
+ if (*from == SS2 && len >= 4) /* code set 2 */
+ {
+ from++;
+ *to = (((uint32) SS2) << 24) | (*from++ << 16);
+ *to |= *from++ << 8;
+ *to |= *from++;
+ len -= 4;
+ }
+ else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
+ {
+ from++;
+ *to = (SS3 << 16) | (*from++ << 8);
+ *to |= *from++;
+ len -= 3;
+ }
+ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
+ {
+ *to = *from++ << 8;
+ *to |= *from++;
+ len -= 2;
+ }
+ else
+ {
+ *to = *from++;
+ len--;
+ }
+ to++;
+ cnt++;
+ }
+ *to = 0;
+ return cnt;
+}
+
+static int
+pg_euctw_mblen(const unsigned char *s)
+{
+ int len;
+
+ if (*s == SS2)
+ len = 4;
+ else if (*s == SS3)
+ len = 3;
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = 1;
+ return len;
+}
+
+static int
+pg_euctw_dsplen(const unsigned char *s)
+{
+ int len;
+
+ if (*s == SS2)
+ len = 2;
+ else if (*s == SS3)
+ len = 2;
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2;
+ else
+ len = pg_ascii_dsplen(s);
+ return len;
+}
+
+/*
+ * Convert pg_wchar to EUC_* encoding.
+ * caller must allocate enough space for "to", including a trailing zero!
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ */
+static int
+pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
+{
+ int cnt = 0;
+
+ while (len > 0 && *from)
+ {
+ unsigned char c;
+
+ if ((c = (*from >> 24)))
+ {
+ *to++ = c;
+ *to++ = (*from >> 16) & 0xff;
+ *to++ = (*from >> 8) & 0xff;
+ *to++ = *from & 0xff;
+ cnt += 4;
+ }
+ else if ((c = (*from >> 16)))
+ {
+ *to++ = c;
+ *to++ = (*from >> 8) & 0xff;
+ *to++ = *from & 0xff;
+ cnt += 3;
+ }
+ else if ((c = (*from >> 8)))
+ {
+ *to++ = c;
+ *to++ = *from & 0xff;
+ cnt += 2;
+ }
+ else
+ {
+ *to++ = *from;
+ cnt++;
+ }
+ from++;
+ len--;
+ }
+ *to = 0;
+ return cnt;
+}
+
+
+/*
+ * JOHAB
+ */
+static int
+pg_johab_mblen(const unsigned char *s)
+{
+ return pg_euc_mblen(s);
+}
+
+static int
+pg_johab_dsplen(const unsigned char *s)
+{
+ return pg_euc_dsplen(s);
+}
+
+/*
+ * convert UTF8 string to pg_wchar (UCS-4)
+ * caller must allocate enough space for "to", including a trailing zero!
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ */
+static int
+pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0;
+ uint32 c1,
+ c2,
+ c3,
+ c4;
+
+ while (len > 0 && *from)
+ {
+ if ((*from & 0x80) == 0)
+ {
+ *to = *from++;
+ len--;
+ }
+ else if ((*from & 0xe0) == 0xc0)
+ {
+ if (len < 2)
+ break; /* drop trailing incomplete char */
+ c1 = *from++ & 0x1f;
+ c2 = *from++ & 0x3f;
+ *to = (c1 << 6) | c2;
+ len -= 2;
+ }
+ else if ((*from & 0xf0) == 0xe0)
+ {
+ if (len < 3)
+ break; /* drop trailing incomplete char */
+ c1 = *from++ & 0x0f;
+ c2 = *from++ & 0x3f;
+ c3 = *from++ & 0x3f;
+ *to = (c1 << 12) | (c2 << 6) | c3;
+ len -= 3;
+ }
+ else if ((*from & 0xf8) == 0xf0)
+ {
+ if (len < 4)
+ break; /* drop trailing incomplete char */
+ c1 = *from++ & 0x07;
+ c2 = *from++ & 0x3f;
+ c3 = *from++ & 0x3f;
+ c4 = *from++ & 0x3f;
+ *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
+ len -= 4;
+ }
+ else
+ {
+ /* treat a bogus char as length 1; not ours to raise error */
+ *to = *from++;
+ len--;
+ }
+ to++;
+ cnt++;
+ }
+ *to = 0;
+ return cnt;
+}
+
+
+/*
+ * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
+ * space allocated.
+ */
+unsigned char *
+unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+{
+ if (c <= 0x7F)
+ {
+ utf8string[0] = c;
+ }
+ else if (c <= 0x7FF)
+ {
+ utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
+ utf8string[1] = 0x80 | (c & 0x3F);
+ }
+ else if (c <= 0xFFFF)
+ {
+ utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
+ utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
+ utf8string[2] = 0x80 | (c & 0x3F);
+ }
+ else
+ {
+ utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
+ utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
+ utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
+ utf8string[3] = 0x80 | (c & 0x3F);
+ }
+
+ return utf8string;
+}
+
+/*
+ * Trivial conversion from pg_wchar to UTF-8.
+ * caller should allocate enough space for "to"
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ */
+static int
+pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
+{
+ int cnt = 0;
+
+ while (len > 0 && *from)
+ {
+ int char_len;
+
+ unicode_to_utf8(*from, to);
+ char_len = pg_utf_mblen(to);
+ cnt += char_len;
+ to += char_len;
+ from++;
+ len--;
+ }
+ *to = 0;
+ return cnt;
+}
+
+/*
+ * Return the byte length of a UTF8 character pointed to by s
+ *
+ * Note: in the current implementation we do not support UTF8 sequences
+ * of more than 4 bytes; hence do NOT return a value larger than 4.
+ * We return "1" for any leading byte that is either flat-out illegal or
+ * indicates a length larger than we support.
+ *
+ * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
+ * other places would need to be fixed to change this.
+ */
+int
+pg_utf_mblen(const unsigned char *s)
+{
+ int len; /* decided from the first byte only */
+
+ if ((*s & 0x80) == 0)
+ len = 1; /* 0xxxxxxx */
+ else if ((*s & 0xe0) == 0xc0)
+ len = 2; /* 110xxxxx */
+ else if ((*s & 0xf0) == 0xe0)
+ len = 3; /* 1110xxxx */
+ else if ((*s & 0xf8) == 0xf0)
+ len = 4; /* 11110xxx */
+#ifdef NOT_USED
+ else if ((*s & 0xfc) == 0xf8)
+ len = 5;
+ else if ((*s & 0xfe) == 0xfc)
+ len = 6;
+#endif
+ else
+ len = 1; /* 10xxxxxx or 11111xxx: not a supported lead byte */
+ return len;
+}
+
+/*
+ * This is an implementation of wcwidth() and wcswidth() as defined in
+ * "The Single UNIX Specification, Version 2, The Open Group, 1997"
+ * <http://www.unix.org/online.html>
+ *
+ * Markus Kuhn -- 2001-09-08 -- public domain
+ *
+ * customised for PostgreSQL
+ *
+ * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+ */
+
+struct mbinterval
+{
+ unsigned short first; /* first code point of the interval (inclusive) */
+ unsigned short last; /* last code point of the interval (inclusive) */
+};
+
+/* binary search in a sorted interval table: 1 if ucs is in table[0..max], else 0 */
+static int
+mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
+{
+ int min = 0; /* note: max is the index of the last entry, not a count */
+ int mid;
+
+ if (ucs < table[0].first || ucs > table[max].last) /* outside whole table */
+ return 0;
+ while (max >= min)
+ {
+ mid = (min + max) / 2;
+ if (ucs > table[mid].last)
+ min = mid + 1; /* continue in the upper half */
+ else if (ucs < table[mid].first)
+ max = mid - 1; /* continue in the lower half */
+ else
+ return 1; /* first <= ucs <= last */
+ }
+
+ return 0; /* ucs falls in a gap between intervals */
+}
+
+
+/* The following functions define the column width of an ISO 10646
+ * character as follows:
+ *
+ * - The null character (U+0000) has a column width of 0.
+ *
+ * - Other C0/C1 control characters and DEL will lead to a return
+ * value of -1.
+ *
+ * - Non-spacing and enclosing combining characters (general
+ * category code Mn or Me in the Unicode database) have a
+ * column width of 0.
+ *
+ * - Other format characters (general category code Cf in the Unicode
+ * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
+ *
+ * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
+ * have a column width of 0.
+ *
+ * - Spacing characters in the East Asian Wide (W) or East Asian
+ * FullWidth (F) category as defined in Unicode Technical
+ * Report #11 have a column width of 2.
+ *
+ * - All remaining characters (including all printable
+ * ISO 8859-1 and WGL4 characters, Unicode control characters,
+ * etc.) have a column width of 1.
+ *
+ * This implementation assumes that wchar_t characters are encoded
+ * in ISO 10646.
+ */
+
+static int
+ucs_wcwidth(pg_wchar ucs)
+{
+#include "common/unicode_combining_table.h" /* defines the local table combining[] */
+
+ /* NUL has width 0; other control characters are rejected just below */
+ if (ucs == 0)
+ return 0;
+
+ if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
+ return -1; /* C0/C1 controls, DEL, or beyond U+10FFFF */
+
+ /* binary search in table of non-spacing characters */
+ if (mbbisearch(ucs, combining,
+ sizeof(combining) / sizeof(struct mbinterval) - 1)) /* index of last entry */
+ return 0;
+
+ /*
+ * if we arrive here, ucs is not a combining or C0/C1 control character
+ */
+
+ return 1 + /* the boolean below adds 1 for the double-width ranges */
+ (ucs >= 0x1100 &&
+ (ucs <= 0x115f || /* Hangul Jamo init. consonants */
+ (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
+ ucs != 0x303f) || /* CJK ... Yi */
+ (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
+ (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
+ * Ideographs */
+ (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
+ (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
+ (ucs >= 0xffe0 && ucs <= 0xffe6) ||
+ (ucs >= 0x20000 && ucs <= 0x2ffff)));
+}
+
+/*
+ * Convert a UTF-8 character to a Unicode code point.
+ * This is a one-character version of pg_utf2wchar_with_len.
+ *
+ * No error checks here, c must point to a long-enough string.
+ */
+pg_wchar
+utf8_to_unicode(const unsigned char *c)
+{
+ if ((*c & 0x80) == 0)
+ return (pg_wchar) c[0]; /* single byte */
+ else if ((*c & 0xe0) == 0xc0) /* 2 bytes: 5 + 6 payload bits */
+ return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ (c[1] & 0x3f));
+ else if ((*c & 0xf0) == 0xe0) /* 3 bytes: 4 + 6 + 6 payload bits */
+ return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ ((c[1] & 0x3f) << 6) |
+ (c[2] & 0x3f));
+ else if ((*c & 0xf8) == 0xf0) /* 4 bytes: 3 + 6 + 6 + 6 payload bits */
+ return (pg_wchar) (((c[0] & 0x07) << 18) |
+ ((c[1] & 0x3f) << 12) |
+ ((c[2] & 0x3f) << 6) |
+ (c[3] & 0x3f));
+ else
+ /* not a supported lead byte: return an out-of-range value on purpose */
+ return 0xffffffff;
+}
+
+static int
+pg_utf_dsplen(const unsigned char *s)
+{
+ return ucs_wcwidth(utf8_to_unicode(s)); /* decode, then look up the width (may be -1) */
+}
+
+/*
+ * convert mule internal code to pg_wchar
+ * caller should allocate enough space for "to"
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ */
+static int
+pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0; /* number of pg_wchars produced */
+
+ while (len > 0 && *from)
+ {
+ if (IS_LC1(*from) && len >= 2)
+ {
+ *to = *from++ << 16; /* lead byte -> bits 16-23 */
+ *to |= *from++; /* second byte -> bits 0-7 */
+ len -= 2;
+ }
+ else if (IS_LCPRV1(*from) && len >= 3)
+ {
+ from++; /* first byte is skipped, not stored */
+ *to = *from++ << 16;
+ *to |= *from++;
+ len -= 3;
+ }
+ else if (IS_LC2(*from) && len >= 3)
+ {
+ *to = *from++ << 16; /* lead byte -> bits 16-23 */
+ *to |= *from++ << 8; /* second byte -> bits 8-15 */
+ *to |= *from++; /* third byte -> bits 0-7 */
+ len -= 3;
+ }
+ else if (IS_LCPRV2(*from) && len >= 4)
+ {
+ from++; /* first byte is skipped, not stored */
+ *to = *from++ << 16;
+ *to |= *from++ << 8;
+ *to |= *from++;
+ len -= 4;
+ }
+ else
+ { /* assume ASCII; also taken when too few bytes remain */
+ *to = (unsigned char) *from++;
+ len--;
+ }
+ to++;
+ cnt++;
+ }
+ *to = 0; /* terminating zero is not counted */
+ return cnt;
+}
+
+/*
+ * convert pg_wchar to mule internal code
+ * caller should allocate enough space for "to"
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ */
+static int
+pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
+{
+ int cnt = 0; /* number of bytes written, excluding the final NUL */
+
+ while (len > 0 && *from)
+ {
+ unsigned char lb;
+
+ lb = (*from >> 16) & 0xff; /* bits 16-23 select the output form */
+ if (IS_LC1(lb))
+ {
+ *to++ = lb;
+ *to++ = *from & 0xff;
+ cnt += 2;
+ }
+ else if (IS_LC2(lb))
+ {
+ *to++ = lb;
+ *to++ = (*from >> 8) & 0xff;
+ *to++ = *from & 0xff;
+ cnt += 3;
+ }
+ else if (IS_LCPRV1_A_RANGE(lb))
+ {
+ *to++ = LCPRV1_A; /* extra leading byte, then lb, then the low byte */
+ *to++ = lb;
+ *to++ = *from & 0xff;
+ cnt += 3;
+ }
+ else if (IS_LCPRV1_B_RANGE(lb))
+ {
+ *to++ = LCPRV1_B; /* same layout as above with a different leading byte */
+ *to++ = lb;
+ *to++ = *from & 0xff;
+ cnt += 3;
+ }
+ else if (IS_LCPRV2_A_RANGE(lb))
+ {
+ *to++ = LCPRV2_A; /* extra leading byte, then lb, then two low bytes */
+ *to++ = lb;
+ *to++ = (*from >> 8) & 0xff;
+ *to++ = *from & 0xff;
+ cnt += 4;
+ }
+ else if (IS_LCPRV2_B_RANGE(lb))
+ {
+ *to++ = LCPRV2_B; /* same layout as above with a different leading byte */
+ *to++ = lb;
+ *to++ = (*from >> 8) & 0xff;
+ *to++ = *from & 0xff;
+ cnt += 4;
+ }
+ else
+ {
+ *to++ = *from & 0xff; /* anything else: emit only the low byte */
+ cnt += 1;
+ }
+ from++;
+ len--;
+ }
+ *to = 0;
+ return cnt;
+}
+
+/* byte length decided from the lead byte alone; exported for direct use by conv.c */
+int
+pg_mule_mblen(const unsigned char *s)
+{
+ int len;
+
+ if (IS_LC1(*s))
+ len = 2; /* same byte counts as pg_mule2wchar_with_len() consumes */
+ else if (IS_LCPRV1(*s))
+ len = 3;
+ else if (IS_LC2(*s))
+ len = 3;
+ else if (IS_LCPRV2(*s))
+ len = 4;
+ else
+ len = 1; /* assume ASCII */
+ return len;
+}
+
+static int
+pg_mule_dsplen(const unsigned char *s)
+{
+ int len; /* display width in columns, chosen from the lead byte */
+
+ /*
+ * Note: it's not really appropriate to assume that all multibyte charsets
+ * are double-wide on screen. But this seems an okay approximation for
+ * the MULE charsets we currently support.
+ */
+
+ if (IS_LC1(*s))
+ len = 1; /* single column */
+ else if (IS_LCPRV1(*s))
+ len = 1; /* single column */
+ else if (IS_LC2(*s))
+ len = 2; /* double column */
+ else if (IS_LCPRV2(*s))
+ len = 2; /* double column */
+ else
+ len = 1; /* assume ASCII */
+
+ return len;
+}
+
+/*
+ * ISO8859-1
+ */
+static int
+pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+ int cnt = 0; /* number of characters converted */
+
+ while (len > 0 && *from) /* stop at len bytes or at the first zero byte */
+ {
+ *to++ = *from++; /* each byte becomes one pg_wchar, value unchanged */
+ len--;
+ cnt++;
+ }
+ *to = 0; /* terminating zero is not counted */
+ return cnt;
+}
+
+/*
+ * Trivial conversion from pg_wchar to single byte encoding. Just ignores
+ * high bits.
+ * caller should allocate enough space for "to"
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ */
+static int
+pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
+{
+ int cnt = 0; /* number of bytes written, excluding the final NUL */
+
+ while (len > 0 && *from) /* stop at len chars or at the first zero */
+ {
+ *to++ = *from++; /* storing into unsigned char drops the high bits */
+ len--;
+ cnt++;
+ }
+ *to = 0;
+ return cnt;
+}
+
+static int
+pg_latin1_mblen(const unsigned char *s)
+{
+ return 1; /* always one byte per character; s is not examined */
+}
+
+static int
+pg_latin1_dsplen(const unsigned char *s)
+{
+ return pg_ascii_dsplen(s); /* width is delegated to the ASCII rule */
+}
+
+/*
+ * SJIS
+ */
+static int
+pg_sjis_mblen(const unsigned char *s)
+{
+ int len; /* decided from the first byte only */
+
+ if (*s >= 0xa1 && *s <= 0xdf)
+ len = 1; /* 1 byte kana? */
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2; /* kanji? (any other byte with the high bit set) */
+ else
+ len = 1; /* should be ASCII */
+ return len;
+}
+
+static int
+pg_sjis_dsplen(const unsigned char *s)
+{
+ int len; /* display width in columns */
+
+ if (*s >= 0xa1 && *s <= 0xdf)
+ len = 1; /* 1 byte kana? */
+ else if (IS_HIGHBIT_SET(*s))
+ len = 2; /* kanji? (any other byte with the high bit set) */
+ else
+ len = pg_ascii_dsplen(s); /* should be ASCII */
+ return len;
+}
+
+/*
+ * Big5
+ */
+static int
+pg_big5_mblen(const unsigned char *s)
+{
+ int len; /* decided from the first byte only */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* high bit set: 2-byte character */
+ else
+ len = 1; /* should be ASCII */
+ return len;
+}
+
+static int
+pg_big5_dsplen(const unsigned char *s)
+{
+ int len; /* display width in columns */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* 2-byte character: two columns */
+ else
+ len = pg_ascii_dsplen(s); /* should be ASCII */
+ return len;
+}
+
+/*
+ * GBK
+ */
+static int
+pg_gbk_mblen(const unsigned char *s)
+{
+ int len; /* decided from the first byte only */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* high bit set: 2-byte character */
+ else
+ len = 1; /* should be ASCII */
+ return len;
+}
+
+static int
+pg_gbk_dsplen(const unsigned char *s)
+{
+ int len; /* display width in columns */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* 2-byte character: two columns */
+ else
+ len = pg_ascii_dsplen(s); /* should be ASCII */
+ return len;
+}
+
+/*
+ * UHC
+ */
+static int
+pg_uhc_mblen(const unsigned char *s)
+{
+ int len; /* decided from the first byte only */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* high bit set: 2-byte character */
+ else
+ len = 1; /* should be ASCII */
+ return len;
+}
+
+static int
+pg_uhc_dsplen(const unsigned char *s)
+{
+ int len; /* display width in columns */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* 2-byte character: two columns */
+ else
+ len = pg_ascii_dsplen(s); /* should be ASCII */
+ return len;
+}
+
+/*
+ * GB18030
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
+ */
+
+/*
+ * Unlike all other mblen() functions, this also looks at the second byte of
+ * the input. However, if you only pass the first byte of a multi-byte
+ * string, and \0 as the second byte, this still works in a predictable way:
+ * a 4-byte character will be reported as two 2-byte characters. That's
+ * enough for all current uses, as a client-only encoding. It works that
+ * way, because in any valid 4-byte GB18030-encoded character, the third and
+ * fourth byte look like a 2-byte encoded character, when looked at
+ * separately.
+ */
+static int
+pg_gb18030_mblen(const unsigned char *s)
+{
+ int len;
+
+ if (!IS_HIGHBIT_SET(*s))
+ len = 1; /* ASCII */
+ else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
+ len = 4; /* second byte in 0x30..0x39 marks the 4-byte form */
+ else
+ len = 2; /* any other second byte, including \0 */
+ return len;
+}
+
+static int
+pg_gb18030_dsplen(const unsigned char *s)
+{
+ int len; /* display width in columns */
+
+ if (IS_HIGHBIT_SET(*s))
+ len = 2; /* two columns for both the 2-byte and 4-byte forms */
+ else
+ len = pg_ascii_dsplen(s); /* ASCII */
+ return len;
+}
+
+/*
+ *-------------------------------------------------------------------
+ * multibyte sequence validators
+ *
+ * These functions accept "s", a pointer to the first byte of a string,
+ * and "len", the remaining length of the string. If there is a validly
+ * encoded character beginning at *s, return its length in bytes; else
+ * return -1.
+ *
+ * The functions can assume that len > 0 and that *s != '\0', but they must
+ * test for and reject zeroes in any additional bytes of a multibyte character.
+ *
+ * Note that this definition allows the function for a single-byte
+ * encoding to be just "return 1".
+ *-------------------------------------------------------------------
+ */
+
+static int
+pg_ascii_verifier(const unsigned char *s, int len)
+{
+ return 1; /* every byte is a valid one-byte character; s and len are unused */
+}
+
+#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) /* inclusive range 0xa1..0xfe */
+
+static int
+pg_eucjp_verifier(const unsigned char *s, int len)
+{
+ int l; /* length of the character being verified */
+ unsigned char c1,
+ c2;
+
+ c1 = *s++;
+
+ switch (c1)
+ {
+ case SS2: /* JIS X 0201 */
+ l = 2;
+ if (l > len)
+ return -1; /* character is truncated */
+ c2 = *s++;
+ if (c2 < 0xa1 || c2 > 0xdf)
+ return -1; /* second byte must be in 0xa1..0xdf */
+ break;
+
+ case SS3: /* JIS X 0212 */
+ l = 3;
+ if (l > len)
+ return -1; /* character is truncated */
+ c2 = *s++;
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1;
+ c2 = *s++; /* c2 is reused for the third byte */
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1;
+ break;
+
+ default:
+ if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
+ {
+ l = 2;
+ if (l > len)
+ return -1; /* character is truncated */
+ if (!IS_EUC_RANGE_VALID(c1))
+ return -1; /* lead byte itself is range-checked here */
+ c2 = *s++;
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1;
+ }
+ else
+ /* must be ASCII */
+ {
+ l = 1;
+ }
+ break;
+ }
+
+ return l;
+}
+
+static int
+pg_euckr_verifier(const unsigned char *s, int len)
+{
+ int l; /* length of the character being verified */
+ unsigned char c1,
+ c2;
+
+ c1 = *s++;
+
+ if (IS_HIGHBIT_SET(c1))
+ {
+ l = 2;
+ if (l > len)
+ return -1; /* character is truncated */
+ if (!IS_EUC_RANGE_VALID(c1))
+ return -1; /* lead byte must be in 0xa1..0xfe */
+ c2 = *s++;
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1; /* so must the second byte */
+ }
+ else
+ /* must be ASCII */
+ {
+ l = 1;
+ }
+
+ return l;
+}
+
+/* EUC-CN byte sequences are exactly same as EUC-KR */
+#define pg_euccn_verifier pg_euckr_verifier
+
+static int
+pg_euctw_verifier(const unsigned char *s, int len)
+{
+ int l; /* length of the character being verified */
+ unsigned char c1,
+ c2;
+
+ c1 = *s++;
+
+ switch (c1)
+ {
+ case SS2: /* CNS 11643 Plane 1-7 */
+ l = 4;
+ if (l > len)
+ return -1; /* character is truncated */
+ c2 = *s++;
+ if (c2 < 0xa1 || c2 > 0xa7)
+ return -1; /* second byte must be in 0xa1..0xa7 */
+ c2 = *s++; /* c2 is reused for the third byte */
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1;
+ c2 = *s++; /* ... and for the fourth byte */
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1;
+ break;
+
+ case SS3: /* unused */
+ return -1;
+
+ default:
+ if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
+ {
+ l = 2;
+ if (l > len)
+ return -1; /* character is truncated */
+ /* no further range check on c1? */
+ c2 = *s++;
+ if (!IS_EUC_RANGE_VALID(c2))
+ return -1;
+ }
+ else
+ /* must be ASCII */
+ {
+ l = 1;
+ }
+ break;
+ }
+ return l;
+}
+
+static int
+pg_johab_verifier(const unsigned char *s, int len)
+{
+ int l,
+ mbl; /* mbl keeps the length to return; l is counted down */
+ unsigned char c;
+
+ l = mbl = pg_johab_mblen(s);
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ if (!IS_HIGHBIT_SET(*s))
+ return mbl; /* no high bit on the lead byte: accepted as is */
+
+ while (--l > 0) /* check every byte after the lead byte */
+ {
+ c = *++s;
+ if (!IS_EUC_RANGE_VALID(c))
+ return -1;
+ }
+ return mbl;
+}
+
+static int
+pg_mule_verifier(const unsigned char *s, int len)
+{
+ int l,
+ mbl; /* mbl keeps the length to return; l is counted down */
+ unsigned char c;
+
+ l = mbl = pg_mule_mblen(s);
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ while (--l > 0) /* check every byte after the lead byte */
+ {
+ c = *++s;
+ if (!IS_HIGHBIT_SET(c))
+ return -1; /* trailing bytes must have the high bit set */
+ }
+ return mbl;
+}
+
+static int
+pg_latin1_verifier(const unsigned char *s, int len)
+{
+ return 1; /* every byte is a valid one-byte character; s and len are unused */
+}
+
+static int
+pg_sjis_verifier(const unsigned char *s, int len)
+{
+ int l,
+ mbl;
+ unsigned char c1,
+ c2;
+
+ l = mbl = pg_sjis_mblen(s);
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ if (l == 1) /* pg_sjis_mblen already verified it */
+ return mbl;
+
+ c1 = *s++; /* from here on the character is two bytes long */
+ c2 = *s;
+ if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
+ return -1; /* both the lead and the trail byte are checked */
+ return mbl;
+}
+
+static int
+pg_big5_verifier(const unsigned char *s, int len)
+{
+ int l,
+ mbl; /* mbl keeps the length to return; l is counted down */
+
+ l = mbl = pg_big5_mblen(s);
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1; /* the pair pg_encoding_set_invalid() writes is never valid */
+
+ while (--l > 0)
+ {
+ if (*++s == '\0') /* otherwise any non-NUL trailing byte is accepted */
+ return -1;
+ }
+
+ return mbl;
+}
+
+static int
+pg_gbk_verifier(const unsigned char *s, int len)
+{
+ int l,
+ mbl; /* mbl keeps the length to return; l is counted down */
+
+ l = mbl = pg_gbk_mblen(s);
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1; /* the pair pg_encoding_set_invalid() writes is never valid */
+
+ while (--l > 0)
+ {
+ if (*++s == '\0') /* otherwise any non-NUL trailing byte is accepted */
+ return -1;
+ }
+
+ return mbl;
+}
+
+static int
+pg_uhc_verifier(const unsigned char *s, int len)
+{
+ int l,
+ mbl; /* mbl keeps the length to return; l is counted down */
+
+ l = mbl = pg_uhc_mblen(s);
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ if (l == 2 &&
+ s[0] == NONUTF8_INVALID_BYTE0 &&
+ s[1] == NONUTF8_INVALID_BYTE1)
+ return -1; /* the pair pg_encoding_set_invalid() writes is never valid */
+
+ while (--l > 0)
+ {
+ if (*++s == '\0') /* otherwise any non-NUL trailing byte is accepted */
+ return -1;
+ }
+
+ return mbl;
+}
+
+static int
+pg_gb18030_verifier(const unsigned char *s, int len)
+{
+ int l; /* character length, or -1 if invalid */
+
+ if (!IS_HIGHBIT_SET(*s))
+ l = 1; /* ASCII */
+ else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
+ {
+ /* Should be 4-byte, validate remaining bytes */
+ if (*s >= 0x81 && *s <= 0xfe &&
+ *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
+ *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
+ l = 4;
+ else
+ l = -1;
+ }
+ else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
+ {
+ /* Should be 2-byte, validate (a digit second byte with len < 4 fails here) */
+ if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
+ (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
+ l = 2;
+ else
+ l = -1;
+ }
+ else
+ l = -1; /* truncated, or lead byte outside 0x81..0xfe */
+ return l;
+}
+
+static int
+pg_utf8_verifier(const unsigned char *s, int len)
+{
+ int l = pg_utf_mblen(s); /* length implied by the lead byte */
+
+ if (len < l)
+ return -1; /* character is truncated */
+
+ if (!pg_utf8_islegal(s, l))
+ return -1; /* bytes are present but do not form a legal sequence */
+
+ return l;
+}
+
+/*
+ * Check for validity of a single UTF-8 encoded character
+ *
+ * This directly implements the rules in RFC3629. The bizarre-looking
+ * restrictions on the second byte are meant to ensure that there isn't
+ * more than one encoding of a given Unicode character point; that is,
+ * you may not use a longer-than-necessary byte sequence with high order
+ * zero bits to represent a character that would fit in fewer bytes.
+ * To do otherwise is to create security hazards (eg, create an apparent
+ * non-ASCII character that decodes to plain ASCII).
+ *
+ * length is assumed to have been obtained by pg_utf_mblen(), and the
+ * caller must have checked that that many bytes are present in the buffer.
+ */
+bool
+pg_utf8_islegal(const unsigned char *source, int length)
+{
+ unsigned char a;
+
+ switch (length)
+ {
+ default:
+ /* reject lengths 5 and 6 for now (and anything else outside 1..4) */
+ return false;
+ case 4:
+ a = source[3]; /* last byte must be 0x80..0xBF */
+ if (a < 0x80 || a > 0xBF)
+ return false;
+ /* FALL THRU */
+ case 3:
+ a = source[2]; /* third byte must be 0x80..0xBF */
+ if (a < 0x80 || a > 0xBF)
+ return false;
+ /* FALL THRU */
+ case 2:
+ a = source[1]; /* allowed range of the second byte depends on the lead byte */
+ switch (*source)
+ {
+ case 0xE0: /* a second byte below 0xA0 would be an overlong form */
+ if (a < 0xA0 || a > 0xBF)
+ return false;
+ break;
+ case 0xED: /* a second byte above 0x9F would give U+D800..U+DFFF */
+ if (a < 0x80 || a > 0x9F)
+ return false;
+ break;
+ case 0xF0: /* a second byte below 0x90 would be an overlong form */
+ if (a < 0x90 || a > 0xBF)
+ return false;
+ break;
+ case 0xF4: /* a second byte above 0x8F would exceed U+10FFFF */
+ if (a < 0x80 || a > 0x8F)
+ return false;
+ break;
+ default:
+ if (a < 0x80 || a > 0xBF)
+ return false;
+ break;
+ }
+ /* FALL THRU */
+ case 1:
+ a = *source; /* finally check the lead byte itself */
+ if (a >= 0x80 && a < 0xC2)
+ return false; /* 0x80..0xBF, or the overlong leads 0xC0 and 0xC1 */
+ if (a > 0xF4)
+ return false; /* would start a value above U+10FFFF */
+ break;
+ }
+ return true;
+}
+
+
+/*
+ * Fills the provided buffer with two bytes (no NUL terminator) such that:
+ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+ Assert(pg_encoding_max_length(encoding) > 1); /* multibyte encodings only */
+
+ dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0); /* 0xc0: length 2 per pg_utf_mblen(), rejected by pg_utf8_islegal() */
+ dst[1] = NONUTF8_INVALID_BYTE1; /* same second byte for every encoding */
+}
+
+/*
+ *-------------------------------------------------------------------
+ * encoding info table; columns: to-wchar, from-wchar, mblen, dsplen, mbverify, maxmblen
+ * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
+ *-------------------------------------------------------------------
+ */
+const pg_wchar_tbl pg_wchar_table[] = {
+ {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
+ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
+ {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
+ {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */
+ {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */
+ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */
+ {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
+ {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
+ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
+ {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
+ {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
+ {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
+ {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
+ {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
+ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
+};
+
+/*
+ * Returns the byte length of a multibyte character.
+ *
+ * Caution: when dealing with text that is not certainly valid in the
+ * specified encoding, the result may exceed the actual remaining
+ * string length. Callers that are not prepared to deal with that
+ * should use pg_encoding_mblen_bounded() instead.
+ */
+int
+pg_encoding_mblen(int encoding, const char *mbstr)
+{
+ return (PG_VALID_ENCODING(encoding) ?
+ pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
+ pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr)); /* invalid id: use the SQL_ASCII entry */
+}
+
+/*
+ * Returns the byte length of a multibyte character; but not more than
+ * the distance to end of string.
+ */
+int
+pg_encoding_mblen_bounded(int encoding, const char *mbstr)
+{
+ return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr)); /* strnlen() stops at the first NUL, capping the result */
+}
+
+/*
+ * Returns the display length of a multibyte character.
+ */
+int
+pg_encoding_dsplen(int encoding, const char *mbstr)
+{
+ return (PG_VALID_ENCODING(encoding) ?
+ pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
+ pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr)); /* invalid id: use the SQL_ASCII entry */
+}
+
+/*
+ * Verify the first multibyte character of the given string.
+ * Return its byte length if good, -1 if bad. (See comments above for
+ * full details of the mbverify API.)
+ */
+int
+pg_encoding_verifymb(int encoding, const char *mbstr, int len)
+{
+ return (PG_VALID_ENCODING(encoding) ?
+ pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
+ pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len)); /* invalid id: use the SQL_ASCII entry */
+}
+
+/* v14+ function name, for easier backpatching; thin wrapper over pg_encoding_verifymb() */
+int
+pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
+{
+ int ok_bytes = pg_encoding_verifymb(encoding, mbstr, len);
+
+ if (ok_bytes == 0) /* defensive: a zero-length result is reported as invalid too */
+ return -1;
+ return ok_bytes; /* character length, or -1 passed through from the verifier */
+}
+
+/* v14+ function (adapted from pg_verify_mbstr_len): returns # of leading valid bytes */
+int
+pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
+{
+ mbverifier mbverify;
+ int ok_bytes;
+
+ Assert(PG_VALID_ENCODING(encoding));
+
+ /*
+ * In single-byte encodings, we need only reject nulls (\0).
+ */
+ if (pg_encoding_max_length(encoding) <= 1) /* an invalid id also lands here (falls back to 1) */
+ {
+ const char *nullpos = memchr(mbstr, 0, len);
+
+ if (nullpos == NULL)
+ return len; /* no NUL: the whole input is valid */
+ return nullpos - mbstr; /* valid up to, not including, the first NUL */
+ }
+
+ /* fetch function pointer just once */
+ mbverify = pg_wchar_table[encoding].mbverify;
+
+ ok_bytes = 0;
+
+ while (len > 0)
+ {
+ int l;
+
+ /* fast path for ASCII-subset characters */
+ if (!IS_HIGHBIT_SET(*mbstr))
+ {
+ if (*mbstr != '\0')
+ {
+ ok_bytes++;
+ mbstr++;
+ len--;
+ continue;
+ }
+ return ok_bytes; /* embedded NUL: stop counting here */
+ }
+
+ l = (*mbverify) ((const unsigned char *) mbstr, len);
+
+ if (l < 0)
+ return ok_bytes; /* invalid or truncated character: stop before it */
+
+ mbstr += l;
+ len -= l;
+ ok_bytes += l;
+ }
+ return ok_bytes; /* equals the input length when everything verified */
+}
+
+/*
+ * fetch maximum length of a given encoding
+ */
+int
+pg_encoding_max_length(int encoding)
+{
+ Assert(PG_VALID_ENCODING(encoding));
+
+ /*
+ * Check for the encoding despite the assert, due to some mingw versions
+ * otherwise issuing bogus warnings.
+ */
+ return PG_VALID_ENCODING(encoding) ?
+ pg_wchar_table[encoding].maxmblen :
+ pg_wchar_table[PG_SQL_ASCII].maxmblen; /* invalid id: use the SQL_ASCII entry */
+}
diff --git a/src/include/common/unicode_combining_table.h b/src/include/common/unicode_combining_table.h
new file mode 100644
index 00000000000..a9f10c31bc8
--- /dev/null
+++ b/src/include/common/unicode_combining_table.h
@@ -0,0 +1,196 @@
+/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */
+
+static const struct mbinterval combining[] = {
+ {0x0300, 0x036F},
+ {0x0483, 0x0489},
+ {0x0591, 0x05BD},
+ {0x05BF, 0x05BF},
+ {0x05C1, 0x05C2},
+ {0x05C4, 0x05C5},
+ {0x05C7, 0x05C7},
+ {0x0610, 0x061A},
+ {0x064B, 0x065F},
+ {0x0670, 0x0670},
+ {0x06D6, 0x06DC},
+ {0x06DF, 0x06E4},
+ {0x06E7, 0x06E8},
+ {0x06EA, 0x06ED},
+ {0x0711, 0x0711},
+ {0x0730, 0x074A},
+ {0x07A6, 0x07B0},
+ {0x07EB, 0x07F3},
+ {0x07FD, 0x07FD},
+ {0x0816, 0x0819},
+ {0x081B, 0x0823},
+ {0x0825, 0x0827},
+ {0x0829, 0x082D},
+ {0x0859, 0x085B},
+ {0x08D3, 0x08E1},
+ {0x08E3, 0x0902},
+ {0x093A, 0x093A},
+ {0x093C, 0x093C},
+ {0x0941, 0x0948},
+ {0x094D, 0x094D},
+ {0x0951, 0x0957},
+ {0x0962, 0x0963},
+ {0x0981, 0x0981},
+ {0x09BC, 0x09BC},
+ {0x09C1, 0x09C4},
+ {0x09CD, 0x09CD},
+ {0x09E2, 0x09E3},
+ {0x09FE, 0x0A02},
+ {0x0A3C, 0x0A3C},
+ {0x0A41, 0x0A51},
+ {0x0A70, 0x0A71},
+ {0x0A75, 0x0A75},
+ {0x0A81, 0x0A82},
+ {0x0ABC, 0x0ABC},
+ {0x0AC1, 0x0AC8},
+ {0x0ACD, 0x0ACD},
+ {0x0AE2, 0x0AE3},
+ {0x0AFA, 0x0B01},
+ {0x0B3C, 0x0B3C},
+ {0x0B3F, 0x0B3F},
+ {0x0B41, 0x0B44},
+ {0x0B4D, 0x0B56},
+ {0x0B62, 0x0B63},
+ {0x0B82, 0x0B82},
+ {0x0BC0, 0x0BC0},
+ {0x0BCD, 0x0BCD},
+ {0x0C00, 0x0C00},
+ {0x0C04, 0x0C04},
+ {0x0C3E, 0x0C40},
+ {0x0C46, 0x0C56},
+ {0x0C62, 0x0C63},
+ {0x0C81, 0x0C81},
+ {0x0CBC, 0x0CBC},
+ {0x0CBF, 0x0CBF},
+ {0x0CC6, 0x0CC6},
+ {0x0CCC, 0x0CCD},
+ {0x0CE2, 0x0CE3},
+ {0x0D00, 0x0D01},
+ {0x0D3B, 0x0D3C},
+ {0x0D41, 0x0D44},
+ {0x0D4D, 0x0D4D},
+ {0x0D62, 0x0D63},
+ {0x0D81, 0x0D81},
+ {0x0DCA, 0x0DCA},
+ {0x0DD2, 0x0DD6},
+ {0x0E31, 0x0E31},
+ {0x0E34, 0x0E3A},
+ {0x0E47, 0x0E4E},
+ {0x0EB1, 0x0EB1},
+ {0x0EB4, 0x0EBC},
+ {0x0EC8, 0x0ECD},
+ {0x0F18, 0x0F19},
+ {0x0F35, 0x0F35},
+ {0x0F37, 0x0F37},
+ {0x0F39, 0x0F39},
+ {0x0F71, 0x0F7E},
+ {0x0F80, 0x0F84},
+ {0x0F86, 0x0F87},
+ {0x0F8D, 0x0FBC},
+ {0x0FC6, 0x0FC6},
+ {0x102D, 0x1030},
+ {0x1032, 0x1037},
+ {0x1039, 0x103A},
+ {0x103D, 0x103E},
+ {0x1058, 0x1059},
+ {0x105E, 0x1060},
+ {0x1071, 0x1074},
+ {0x1082, 0x1082},
+ {0x1085, 0x1086},
+ {0x108D, 0x108D},
+ {0x109D, 0x109D},
+ {0x135D, 0x135F},
+ {0x1712, 0x1714},
+ {0x1732, 0x1734},
+ {0x1752, 0x1753},
+ {0x1772, 0x1773},
+ {0x17B4, 0x17B5},
+ {0x17B7, 0x17BD},
+ {0x17C6, 0x17C6},
+ {0x17C9, 0x17D3},
+ {0x17DD, 0x17DD},
+ {0x180B, 0x180D},
+ {0x1885, 0x1886},
+ {0x18A9, 0x18A9},
+ {0x1920, 0x1922},
+ {0x1927, 0x1928},
+ {0x1932, 0x1932},
+ {0x1939, 0x193B},
+ {0x1A17, 0x1A18},
+ {0x1A1B, 0x1A1B},
+ {0x1A56, 0x1A56},
+ {0x1A58, 0x1A60},
+ {0x1A62, 0x1A62},
+ {0x1A65, 0x1A6C},
+ {0x1A73, 0x1A7F},
+ {0x1AB0, 0x1B03},
+ {0x1B34, 0x1B34},
+ {0x1B36, 0x1B3A},
+ {0x1B3C, 0x1B3C},
+ {0x1B42, 0x1B42},
+ {0x1B6B, 0x1B73},
+ {0x1B80, 0x1B81},
+ {0x1BA2, 0x1BA5},
+ {0x1BA8, 0x1BA9},
+ {0x1BAB, 0x1BAD},
+ {0x1BE6, 0x1BE6},
+ {0x1BE8, 0x1BE9},
+ {0x1BED, 0x1BED},
+ {0x1BEF, 0x1BF1},
+ {0x1C2C, 0x1C33},
+ {0x1C36, 0x1C37},
+ {0x1CD0, 0x1CD2},
+ {0x1CD4, 0x1CE0},
+ {0x1CE2, 0x1CE8},
+ {0x1CED, 0x1CED},
+ {0x1CF4, 0x1CF4},
+ {0x1CF8, 0x1CF9},
+ {0x1DC0, 0x1DFF},
+ {0x20D0, 0x20F0},
+ {0x2CEF, 0x2CF1},
+ {0x2D7F, 0x2D7F},
+ {0x2DE0, 0x2DFF},
+ {0x302A, 0x302D},
+ {0x3099, 0x309A},
+ {0xA66F, 0xA672},
+ {0xA674, 0xA67D},
+ {0xA69E, 0xA69F},
+ {0xA6F0, 0xA6F1},
+ {0xA802, 0xA802},
+ {0xA806, 0xA806},
+ {0xA80B, 0xA80B},
+ {0xA825, 0xA826},
+ {0xA82C, 0xA82C},
+ {0xA8C4, 0xA8C5},
+ {0xA8E0, 0xA8F1},
+ {0xA8FF, 0xA8FF},
+ {0xA926, 0xA92D},
+ {0xA947, 0xA951},
+ {0xA980, 0xA982},
+ {0xA9B3, 0xA9B3},
+ {0xA9B6, 0xA9B9},
+ {0xA9BC, 0xA9BD},
+ {0xA9E5, 0xA9E5},
+ {0xAA29, 0xAA2E},
+ {0xAA31, 0xAA32},
+ {0xAA35, 0xAA36},
+ {0xAA43, 0xAA43},
+ {0xAA4C, 0xAA4C},
+ {0xAA7C, 0xAA7C},
+ {0xAAB0, 0xAAB0},
+ {0xAAB2, 0xAAB4},
+ {0xAAB7, 0xAAB8},
+ {0xAABE, 0xAABF},
+ {0xAAC1, 0xAAC1},
+ {0xAAEC, 0xAAED},
+ {0xAAF6, 0xAAF6},
+ {0xABE5, 0xABE5},
+ {0xABE8, 0xABE8},
+ {0xABED, 0xABED},
+ {0xFB1E, 0xFB1E},
+ {0xFE00, 0xFE0F},
+ {0xFE20, 0xFE2F},
+};
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 07b316fae1d..2bbdf2e792f 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -521,6 +521,10 @@ extern int pg_valid_server_encoding_id(int encoding);
* of them do exist inside libpq.
*/
extern void pg_encoding_set_invalid(int encoding, char *dst);
+extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
+extern int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len);
+extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
+
extern int pg_mb2wchar(const char *from, pg_wchar *to);
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
extern int pg_encoding_mb2wchar_with_len(int encoding,
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index a29d19a6268..dccee6a5597 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -132,6 +132,8 @@ static int check_field_number(const PGresult *res, int field_num);
#define PGRESULT_SEP_ALLOC_THRESHOLD (PGRESULT_DATA_BLOCKSIZE / 2)
+
+
/*
* PQmakeEmptyPGresult
* returns a newly allocated, initialized PGresult with given status.
@@ -3403,9 +3405,10 @@ PQescapeStringInternal(PGconn *conn,
if (error)
*error = 1;
if (conn)
- libpq_append_conn_error(conn, "incomplete multibyte character");
+ printfPQExpBuffer(&conn->errorMessage,
+ libpq_gettext("incomplete multibyte character\n"));
- pg_encoding_set_invalid(encoding, target);
+ pg_encoding_set_invalid(encoding, target);
target += 2;
source++;
remaining--;
--
2.39.5 (Apple Git-154)
From 27827fe62777a809cc3f5a54742839bc031b02f6 Mon Sep 17 00:00:00 2001
From: Filip Janus <fjanus@redhat.com>
Date: Tue, 18 Mar 2025 10:11:09 +0100
Subject: [PATCH 8/8] Fix failing dropdb.c
---
src/bin/scripts/dropdb.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c
index ed3a2c8c19a..140982717d9 100644
--- a/src/bin/scripts/dropdb.c
+++ b/src/bin/scripts/dropdb.c
@@ -122,10 +122,6 @@ main(int argc, char *argv[])
exit(0);
}
- initPQExpBuffer(&sql);
-
- appendPQExpBuffer(&sql, "DROP DATABASE %s%s;",
- (if_exists ? "IF EXISTS " : ""), fmtIdEnc(dbname, PQclientEncoding(conn)));
/* Avoid trying to drop postgres db while we are connected to it. */
if (maintenance_db == NULL && strcmp(dbname, "postgres") == 0)
2.39.5 (Apple Git-154)