3671 lines
99 KiB
Diff
3671 lines
99 KiB
Diff
From 62235454d50a62138341a87be065e4681684753a Mon Sep 17 00:00:00 2001
|
|
From: Andres Freund <andres@anarazel.de>
|
|
Date: Mon, 10 Feb 2025 10:03:37 -0500
|
|
Subject: [PATCH 1/8] Backport upstream commit
|
|
4dc28963533704fc7dd922b9447467466a233d89 Add pg_encoding_set_invalid()
|
|
|
|
There are cases where we cannot / do not want to error out for invalidly
|
|
encoded input. In such cases it can be useful to replace e.g. an incomplete
|
|
multi-byte character with bytes that will trigger an error when getting
|
|
validated as part of a larger string.
|
|
|
|
Unfortunately, until now, for some encodings no such sequence existed. For
|
|
those encodings this commit removes one previously accepted input combination
|
|
- we consider that to be ok, as the chosen bytes are outside of the valid
|
|
ranges for the encodings; we just previously failed to detect that.
|
|
|
|
As we cannot add a new field to pg_wchar_table without breaking ABI, this is
|
|
implemented "in-line" in the newly added function.
|
|
---
|
|
src/backend/utils/mb/wchar.c | 55 +++++++++++++++++++++++++++++++++++-
|
|
src/include/mb/pg_wchar.h | 1 +
|
|
2 files changed, 55 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
|
|
index 1b5ce1740c0..872241cc804 100644
|
|
--- a/src/backend/utils/mb/wchar.c
|
|
+++ b/src/backend/utils/mb/wchar.c
|
|
@@ -14,6 +14,25 @@
|
|
#include "mb/pg_wchar.h"
|
|
|
|
|
|
+/*
|
|
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
|
|
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
|
|
+ *
|
|
+ * For historical reasons, several verifychar implementations opt to reject
|
|
+ * this pair specifically. Byte pair range constraints, in encoding
|
|
+ * originator documentation, always excluded this pair. No core conversion
|
|
+ * could translate it. However, longstanding verifychar implementations
|
|
+ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
|
|
+ * pairs not valid per encoding originator documentation. To avoid tightening
|
|
+ * core or non-core conversions in a security patch, we sought this one pair.
|
|
+ *
|
|
+ * PQescapeString() historically used spaces for BYTE1; many other values
|
|
+ * could suffice for BYTE1.
|
|
+ */
|
|
+#define NONUTF8_INVALID_BYTE0 (0x8d)
|
|
+#define NONUTF8_INVALID_BYTE1 (' ')
|
|
+
|
|
+
|
|
/*
|
|
* Operations on multi-byte encodings are driven by a table of helper
|
|
* functions.
|
|
@@ -1394,6 +1413,11 @@ pg_big5_verifier(const unsigned char *s, int len)
|
|
if (len < l)
|
|
return -1;
|
|
|
|
+ if (l == 2 &&
|
|
+ s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
+ s[1] == NONUTF8_INVALID_BYTE1)
|
|
+ return -1;
|
|
+
|
|
while (--l > 0)
|
|
{
|
|
if (*++s == '\0')
|
|
@@ -1414,6 +1438,11 @@ pg_gbk_verifier(const unsigned char *s, int len)
|
|
if (len < l)
|
|
return -1;
|
|
|
|
+ if (l == 2 &&
|
|
+ s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
+ s[1] == NONUTF8_INVALID_BYTE1)
|
|
+ return -1;
|
|
+
|
|
while (--l > 0)
|
|
{
|
|
if (*++s == '\0')
|
|
@@ -1434,6 +1463,11 @@ pg_uhc_verifier(const unsigned char *s, int len)
|
|
if (len < l)
|
|
return -1;
|
|
|
|
+ if (l == 2 &&
|
|
+ s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
+ s[1] == NONUTF8_INVALID_BYTE1)
|
|
+ return -1;
|
|
+
|
|
while (--l > 0)
|
|
{
|
|
if (*++s == '\0')
|
|
@@ -1768,6 +1802,19 @@ pg_eucjp_increment(unsigned char *charptr, int length)
|
|
#endif /* !FRONTEND */
|
|
|
|
|
|
+/*
|
|
+ * Fills the provided buffer with two bytes such that:
|
|
+ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
|
|
+ */
|
|
+void
|
|
+pg_encoding_set_invalid(int encoding, char *dst)
|
|
+{
|
|
+ Assert(pg_encoding_max_length(encoding) > 1);
|
|
+
|
|
+ dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
|
|
+ dst[1] = NONUTF8_INVALID_BYTE1;
|
|
+}
|
|
+
|
|
/*
|
|
*-------------------------------------------------------------------
|
|
* encoding info table
|
|
@@ -1869,7 +1916,13 @@ pg_encoding_max_length(int encoding)
|
|
{
|
|
Assert(PG_VALID_ENCODING(encoding));
|
|
|
|
- return pg_wchar_table[encoding].maxmblen;
|
|
+ /*
|
|
+ * Check for the encoding despite the assert, due to some mingw versions
|
|
+ * otherwise issuing bogus warnings.
|
|
+ */
|
|
+ return PG_VALID_ENCODING(encoding) ?
|
|
+ pg_wchar_table[encoding].maxmblen :
|
|
+ pg_wchar_table[PG_SQL_ASCII].maxmblen;
|
|
}
|
|
|
|
#ifndef FRONTEND
|
|
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
|
|
index ec101a834ef..07b316fae1d 100644
|
|
--- a/src/include/mb/pg_wchar.h
|
|
+++ b/src/include/mb/pg_wchar.h
|
|
@@ -520,6 +520,7 @@ extern int pg_valid_server_encoding_id(int encoding);
|
|
* Remaining functions are not considered part of libpq's API, though many
|
|
* of them do exist inside libpq.
|
|
*/
|
|
+extern void pg_encoding_set_invalid(int encoding, char *dst);
|
|
extern int pg_mb2wchar(const char *from, pg_wchar *to);
|
|
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
|
|
extern int pg_encoding_mb2wchar_with_len(int encoding,
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 581adbfe8c9db2e641705b308a74e5b6d89c61a6 Mon Sep 17 00:00:00 2001
|
|
From: Andres Freund <andres@anarazel.de>
|
|
Date: Mon, 10 Feb 2025 10:03:37 -0500
|
|
Subject: [PATCH 2/8] Backport upstream
|
|
commit:3e98c8ce50e46d58b91bf3ea806e995296dc5b91 Specify the encoding of input
|
|
to fmtId()
|
|
|
|
This commit adds fmtIdEnc() and fmtQualifiedIdEnc(), which allow specifying
|
|
the encoding as an explicit argument. Additionally setFmtEncoding() is
|
|
provided, which defines the encoding when no explicit encoding is provided, to
|
|
avoid breaking all code using fmtId().
|
|
|
|
All users of fmtId()/fmtQualifiedId() are either converted to the explicit
|
|
version or a call to setFmtEncoding() has been added.
|
|
|
|
This commit does not yet utilize the now well-defined encoding; that will
|
|
happen in a subsequent commit.
|
|
---
|
|
src/bin/pg_dump/pg_backup_archiver.c | 1 +
|
|
src/bin/pg_dump/pg_dump.c | 1 +
|
|
src/bin/pg_dump/pg_dumpall.c | 1 +
|
|
src/bin/psql/command.c | 3 +
|
|
src/bin/scripts/common.c | 5 +-
|
|
src/bin/scripts/createdb.c | 2 +
|
|
src/bin/scripts/createuser.c | 2 +
|
|
src/bin/scripts/dropdb.c | 8 ++-
|
|
src/bin/scripts/dropuser.c | 3 +-
|
|
src/bin/scripts/reindexdb.c | 4 +-
|
|
src/bin/scripts/vacuumdb.c | 5 +-
|
|
src/fe_utils/string_utils.c | 84 ++++++++++++++++++++++++++--
|
|
src/include/fe_utils/string_utils.h | 5 +-
|
|
13 files changed, 109 insertions(+), 15 deletions(-)
|
|
|
|
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
|
|
index 6476f7119af..489a84aca3d 100644
|
|
--- a/src/bin/pg_dump/pg_backup_archiver.c
|
|
+++ b/src/bin/pg_dump/pg_backup_archiver.c
|
|
@@ -2731,6 +2731,7 @@ processEncodingEntry(ArchiveHandle *AH, TocEntry *te)
|
|
fatal("unrecognized encoding \"%s\"",
|
|
ptr1);
|
|
AH->public.encoding = encoding;
|
|
+ setFmtEncoding(encoding);
|
|
}
|
|
else
|
|
fatal("invalid ENCODING item: %s",
|
|
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
|
|
index 67a3714c62c..53fc95f3033 100644
|
|
--- a/src/bin/pg_dump/pg_dump.c
|
|
+++ b/src/bin/pg_dump/pg_dump.c
|
|
@@ -1085,6 +1085,7 @@ setup_connection(Archive *AH, const char *dumpencoding,
|
|
* we know how to escape strings.
|
|
*/
|
|
AH->encoding = PQclientEncoding(conn);
|
|
+ setFmtEncoding(AH->encoding);
|
|
|
|
std_strings = PQparameterStatus(conn, "standard_conforming_strings");
|
|
AH->std_strings = (std_strings && strcmp(std_strings, "on") == 0);
|
|
diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c
|
|
index 27093220ab9..a44cd765c45 100644
|
|
--- a/src/bin/pg_dump/pg_dumpall.c
|
|
+++ b/src/bin/pg_dump/pg_dumpall.c
|
|
@@ -508,6 +508,7 @@ main(int argc, char *argv[])
|
|
* we know how to escape strings.
|
|
*/
|
|
encoding = PQclientEncoding(conn);
|
|
+ setFmtEncoding(encoding);
|
|
std_strings = PQparameterStatus(conn, "standard_conforming_strings");
|
|
if (!std_strings)
|
|
std_strings = "off";
|
|
diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c
|
|
index 8889f833714..66e7fb4cf31 100644
|
|
--- a/src/bin/psql/command.c
|
|
+++ b/src/bin/psql/command.c
|
|
@@ -1183,6 +1183,7 @@ exec_command_encoding(PsqlScanState scan_state, bool active_branch)
|
|
/* save encoding info into psql internal data */
|
|
pset.encoding = PQclientEncoding(pset.db);
|
|
pset.popt.topt.encoding = pset.encoding;
|
|
+ setFmtEncoding(pset.encoding);
|
|
SetVariable(pset.vars, "ENCODING",
|
|
pg_encoding_to_char(pset.encoding));
|
|
}
|
|
@@ -3467,6 +3468,8 @@ SyncVariables(void)
|
|
pset.popt.topt.encoding = pset.encoding;
|
|
pset.sversion = PQserverVersion(pset.db);
|
|
|
|
+ setFmtEncoding(pset.encoding);
|
|
+
|
|
SetVariable(pset.vars, "DBNAME", PQdb(pset.db));
|
|
SetVariable(pset.vars, "USER", PQuser(pset.db));
|
|
SetVariable(pset.vars, "HOST", PQhost(pset.db));
|
|
diff --git a/src/bin/scripts/common.c b/src/bin/scripts/common.c
|
|
index 2de696c19ef..ff79421a31d 100644
|
|
--- a/src/bin/scripts/common.c
|
|
+++ b/src/bin/scripts/common.c
|
|
@@ -376,8 +376,9 @@ appendQualifiedRelation(PQExpBuffer buf, const char *spec,
|
|
exit(1);
|
|
}
|
|
appendPQExpBufferStr(buf,
|
|
- fmtQualifiedId(PQgetvalue(res, 0, 1),
|
|
- PQgetvalue(res, 0, 0)));
|
|
+ fmtQualifiedIdEnc(PQgetvalue(res, 0, 1),
|
|
+ PQgetvalue(res, 0, 0),
|
|
+ PQclientEncoding(conn)));
|
|
appendPQExpBufferStr(buf, columns);
|
|
PQclear(res);
|
|
termPQExpBuffer(&sql);
|
|
diff --git a/src/bin/scripts/createdb.c b/src/bin/scripts/createdb.c
|
|
index b4d3e134d93..d9f55cc9f5d 100644
|
|
--- a/src/bin/scripts/createdb.c
|
|
+++ b/src/bin/scripts/createdb.c
|
|
@@ -190,6 +190,8 @@ main(int argc, char *argv[])
|
|
|
|
conn = connectMaintenanceDatabase(&cparams, progname, echo);
|
|
|
|
+ setFmtEncoding(PQclientEncoding(conn));
|
|
+
|
|
initPQExpBuffer(&sql);
|
|
|
|
appendPQExpBuffer(&sql, "CREATE DATABASE %s",
|
|
diff --git a/src/bin/scripts/createuser.c b/src/bin/scripts/createuser.c
|
|
index dbc2c2a58cd..7ec8ee51be7 100644
|
|
--- a/src/bin/scripts/createuser.c
|
|
+++ b/src/bin/scripts/createuser.c
|
|
@@ -271,6 +271,8 @@ main(int argc, char *argv[])
|
|
|
|
conn = connectMaintenanceDatabase(&cparams, progname, echo);
|
|
|
|
+ setFmtEncoding(PQclientEncoding(conn));
|
|
+
|
|
initPQExpBuffer(&sql);
|
|
|
|
printfPQExpBuffer(&sql, "CREATE ROLE %s", fmtId(newuser));
|
|
diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c
|
|
index ffdf12bfea7..0d636d0ef46 100644
|
|
--- a/src/bin/scripts/dropdb.c
|
|
+++ b/src/bin/scripts/dropdb.c
|
|
@@ -125,7 +125,7 @@ main(int argc, char *argv[])
|
|
initPQExpBuffer(&sql);
|
|
|
|
appendPQExpBuffer(&sql, "DROP DATABASE %s%s;",
|
|
- (if_exists ? "IF EXISTS " : ""), fmtId(dbname));
|
|
+ (if_exists ? "IF EXISTS " : ""), fmtIdEnc(dbname, PQclientEncoding(conn)));
|
|
|
|
/* Avoid trying to drop postgres db while we are connected to it. */
|
|
if (maintenance_db == NULL && strcmp(dbname, "postgres") == 0)
|
|
@@ -140,6 +140,12 @@ main(int argc, char *argv[])
|
|
|
|
conn = connectMaintenanceDatabase(&cparams, progname, echo);
|
|
|
|
+ initPQExpBuffer(&sql);
|
|
+ appendPQExpBuffer(&sql, "DROP DATABASE %s%s%s;",
|
|
+ (if_exists ? "IF EXISTS " : ""),
|
|
+ fmtIdEnc(dbname, PQclientEncoding(conn)),
|
|
+ force ? " WITH (FORCE)" : "");
|
|
+
|
|
if (echo)
|
|
printf("%s\n", sql.data);
|
|
result = PQexec(conn, sql.data);
|
|
diff --git a/src/bin/scripts/dropuser.c b/src/bin/scripts/dropuser.c
|
|
index a8be6b0784b..26523f85784 100644
|
|
--- a/src/bin/scripts/dropuser.c
|
|
+++ b/src/bin/scripts/dropuser.c
|
|
@@ -143,7 +143,8 @@ main(int argc, char *argv[])
|
|
|
|
initPQExpBuffer(&sql);
|
|
appendPQExpBuffer(&sql, "DROP ROLE %s%s;",
|
|
- (if_exists ? "IF EXISTS " : ""), fmtId(dropuser));
|
|
+ (if_exists ? "IF EXISTS " : ""),
|
|
+ fmtIdEnc(dropuser, PQclientEncoding(conn)));
|
|
|
|
if (echo)
|
|
printf("%s\n", sql.data);
|
|
diff --git a/src/bin/scripts/reindexdb.c b/src/bin/scripts/reindexdb.c
|
|
index 39b4078b411..b96d0ff54cf 100644
|
|
--- a/src/bin/scripts/reindexdb.c
|
|
+++ b/src/bin/scripts/reindexdb.c
|
|
@@ -325,7 +325,7 @@ reindex_one_database(const ConnParams *cparams,
|
|
else if (strcmp(type, "SCHEMA") == 0)
|
|
appendPQExpBufferStr(&sql, name);
|
|
else if (strcmp(type, "DATABASE") == 0)
|
|
- appendPQExpBufferStr(&sql, fmtId(PQdb(conn)));
|
|
+ appendPQExpBufferStr(&sql, fmtIdEnc(PQdb(conn),PQclientEncoding(conn)));
|
|
appendPQExpBufferChar(&sql, ';');
|
|
|
|
if (!executeMaintenanceCommand(conn, sql.data, echo))
|
|
@@ -403,7 +403,7 @@ reindex_system_catalogs(const ConnParams *cparams,
|
|
appendPQExpBufferStr(&sql, " SYSTEM ");
|
|
if (concurrently)
|
|
appendPQExpBuffer(&sql, "CONCURRENTLY ");
|
|
- appendPQExpBufferStr(&sql, fmtId(PQdb(conn)));
|
|
+ appendPQExpBufferStr(&sql, fmtIdEnc(PQdb(conn),PQclientEncoding(conn)));
|
|
appendPQExpBufferChar(&sql, ';');
|
|
|
|
if (!executeMaintenanceCommand(conn, sql.data, echo))
|
|
diff --git a/src/bin/scripts/vacuumdb.c b/src/bin/scripts/vacuumdb.c
|
|
index 6ade0c31a9d..8f9ce6529dc 100644
|
|
--- a/src/bin/scripts/vacuumdb.c
|
|
+++ b/src/bin/scripts/vacuumdb.c
|
|
@@ -602,8 +602,9 @@ vacuum_one_database(const ConnParams *cparams,
|
|
for (i = 0; i < ntups; i++)
|
|
{
|
|
appendPQExpBufferStr(&buf,
|
|
- fmtQualifiedId(PQgetvalue(res, i, 1),
|
|
- PQgetvalue(res, i, 0)));
|
|
+ fmtQualifiedIdEnc(PQgetvalue(res, i, 1),
|
|
+ PQgetvalue(res, i, 0),
|
|
+ PQclientEncoding(conn)));
|
|
|
|
if (tables_listed && !PQgetisnull(res, i, 2))
|
|
appendPQExpBufferStr(&buf, PQgetvalue(res, i, 2));
|
|
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
|
|
index d5757becef2..05f0bd2576d 100644
|
|
--- a/src/fe_utils/string_utils.c
|
|
+++ b/src/fe_utils/string_utils.c
|
|
@@ -18,6 +18,7 @@
|
|
#include <ctype.h>
|
|
|
|
#include "fe_utils/string_utils.h"
|
|
+#include "mb/pg_wchar.h"
|
|
|
|
#include "common/keywords.h"
|
|
|
|
@@ -28,6 +29,8 @@ static PQExpBuffer defaultGetLocalPQExpBuffer(void);
|
|
int quote_all_identifiers = 0;
|
|
PQExpBuffer (*getLocalPQExpBuffer) (void) = defaultGetLocalPQExpBuffer;
|
|
|
|
+static int fmtIdEncoding = -1;
|
|
+
|
|
|
|
/*
|
|
* Returns a temporary PQExpBuffer, valid until the next call to the function.
|
|
@@ -56,14 +59,48 @@ defaultGetLocalPQExpBuffer(void)
|
|
return id_return;
|
|
}
|
|
|
|
+/*
|
|
+ * Set the encoding that fmtId() and fmtQualifiedId() use.
|
|
+ *
|
|
+ * This is not safe against multiple connections having different encodings,
|
|
+ * but there is no real other way to address the need to know the encoding for
|
|
+ * fmtId()/fmtQualifiedId() input for safe escaping. Eventually we should get
|
|
+ * rid of fmtId().
|
|
+ */
|
|
+void
|
|
+setFmtEncoding(int encoding)
|
|
+{
|
|
+ fmtIdEncoding = encoding;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the currently configured encoding for fmtId() and fmtQualifiedId().
|
|
+ */
|
|
+static int
|
|
+getFmtEncoding(void)
|
|
+{
|
|
+ if (fmtIdEncoding != -1)
|
|
+ return fmtIdEncoding;
|
|
+
|
|
+ /*
|
|
+ * In assertion builds it seems best to fail hard if the encoding was not
|
|
+ * set, to make it easier to find places with missing calls. But in
|
|
+ * production builds that seems like a bad idea, thus we instead just
|
|
+ * default to UTF-8.
|
|
+ */
|
|
+ Assert(fmtIdEncoding != -1);
|
|
+
|
|
+ return PG_UTF8;
|
|
+}
|
|
+
|
|
/*
|
|
* Quotes input string if it's not a legitimate SQL identifier as-is.
|
|
*
|
|
- * Note that the returned string must be used before calling fmtId again,
|
|
+ * Note that the returned string must be used before calling fmtIdEnc again,
|
|
* since we re-use the same return buffer each time.
|
|
*/
|
|
const char *
|
|
-fmtId(const char *rawid)
|
|
+fmtIdEnc(const char *rawid, int encoding)
|
|
{
|
|
PQExpBuffer id_return = getLocalPQExpBuffer();
|
|
|
|
@@ -136,7 +173,24 @@ fmtId(const char *rawid)
|
|
}
|
|
|
|
/*
|
|
- * fmtQualifiedId - construct a schema-qualified name, with quoting as needed.
|
|
+ * Quotes input string if it's not a legitimate SQL identifier as-is.
|
|
+ *
|
|
+ * Note that the returned string must be used before calling fmtId again,
|
|
+ * since we re-use the same return buffer each time.
|
|
+ *
|
|
+ * NB: This assumes setFmtEncoding() previously has been called to configure
|
|
+ * the encoding of rawid. It is preferable to use fmtIdEnc() with an
|
|
+ * explicit encoding.
|
|
+ */
|
|
+const char *
|
|
+fmtId(const char *rawid)
|
|
+{
|
|
+ return fmtIdEnc(rawid, getFmtEncoding());
|
|
+}
|
|
+
|
|
+/*
|
|
+ * fmtQualifiedIdEnc - construct a schema-qualified name, with quoting as
|
|
+ * needed.
|
|
*
|
|
* Like fmtId, use the result before calling again.
|
|
*
|
|
@@ -144,7 +198,7 @@ fmtId(const char *rawid)
|
|
* use that buffer until we're finished with calling fmtId().
|
|
*/
|
|
const char *
|
|
-fmtQualifiedId(const char *schema, const char *id)
|
|
+fmtQualifiedIdEnc(const char *schema, const char *id, int encoding)
|
|
{
|
|
PQExpBuffer id_return;
|
|
PQExpBuffer lcl_pqexp = createPQExpBuffer();
|
|
@@ -152,9 +206,9 @@ fmtQualifiedId(const char *schema, const char *id)
|
|
/* Some callers might fail to provide a schema name */
|
|
if (schema && *schema)
|
|
{
|
|
- appendPQExpBuffer(lcl_pqexp, "%s.", fmtId(schema));
|
|
+ appendPQExpBuffer(lcl_pqexp, "%s.", fmtIdEnc(schema, encoding));
|
|
}
|
|
- appendPQExpBufferStr(lcl_pqexp, fmtId(id));
|
|
+ appendPQExpBufferStr(lcl_pqexp, fmtIdEnc(id, encoding));
|
|
|
|
id_return = getLocalPQExpBuffer();
|
|
|
|
@@ -164,6 +218,24 @@ fmtQualifiedId(const char *schema, const char *id)
|
|
return id_return->data;
|
|
}
|
|
|
|
+/*
|
|
+ * fmtQualifiedId - construct a schema-qualified name, with quoting as needed.
|
|
+ *
|
|
+ * Like fmtId, use the result before calling again.
|
|
+ *
|
|
+ * Since we call fmtId and it also uses getLocalPQExpBuffer() we cannot
|
|
+ * use that buffer until we're finished with calling fmtId().
|
|
+ *
|
|
+ * NB: This assumes setFmtEncoding() previously has been called to configure
|
|
+ * the encoding of schema/id. It is preferable to use fmtQualifiedIdEnc()
|
|
+ * with an explicit encoding.
|
|
+ */
|
|
+const char *
|
|
+fmtQualifiedId(const char *schema, const char *id)
|
|
+{
|
|
+ return fmtQualifiedIdEnc(schema, id, getFmtEncoding());
|
|
+}
|
|
+
|
|
|
|
/*
|
|
* Format a Postgres version number (in the PG_VERSION_NUM integer format
|
|
diff --git a/src/include/fe_utils/string_utils.h b/src/include/fe_utils/string_utils.h
|
|
index 8c13cc0a66d..37f17f0b370 100644
|
|
--- a/src/include/fe_utils/string_utils.h
|
|
+++ b/src/include/fe_utils/string_utils.h
|
|
@@ -24,8 +24,11 @@ extern int quote_all_identifiers;
|
|
extern PQExpBuffer (*getLocalPQExpBuffer) (void);
|
|
|
|
/* Functions */
|
|
-extern const char *fmtId(const char *identifier);
|
|
+extern const char *fmtId(const char *rawid);
|
|
+extern const char *fmtIdEnc(const char *rawid, int encoding);
|
|
extern const char *fmtQualifiedId(const char *schema, const char *id);
|
|
+extern const char *fmtQualifiedIdEnc(const char *schema, const char *id, int encoding);
|
|
+extern void setFmtEncoding(int encoding);
|
|
|
|
extern char *formatPGVersionNumber(int version_number, bool include_minor,
|
|
char *buf, size_t buflen);
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 7c56df18c1f6e48c4343f2d6d1364c5825e45278 Mon Sep 17 00:00:00 2001
|
|
From: Andres Freund <andres@anarazel.de>
|
|
Date: Mon, 10 Feb 2025 10:03:37 -0500
|
|
Subject: [PATCH 3/8] Backport upstream commit:
|
|
5dc1e42b4fa6a4434afa7d7cdcf0291351a7b873 Fix handling of invalidly encoded
|
|
data in escaping functions
|
|
|
|
Previously invalidly encoded input to various escaping functions could lead to
|
|
the escaped string getting incorrectly parsed by psql. To be safe, escaping
|
|
functions need to ensure that neither invalid nor incomplete multi-byte
|
|
characters can be used to "escape" from being quoted.
|
|
|
|
Functions which can report errors now return an error in more cases than
|
|
before. Functions that cannot report errors now replace invalid input bytes
|
|
with a byte sequence that cannot be used to escape the quotes and that is
|
|
guaranteed to error out when a query is sent to the server.
|
|
|
|
The following functions are fixed by this commit:
|
|
- PQescapeLiteral()
|
|
- PQescapeIdentifier()
|
|
- PQescapeString()
|
|
- PQescapeStringConn()
|
|
- fmtId()
|
|
- appendStringLiteral()
|
|
---
|
|
src/fe_utils/string_utils.c | 170 ++++++++++++++++++++++++++-------
|
|
src/interfaces/libpq/fe-exec.c | 114 ++++++++++++++--------
|
|
2 files changed, 212 insertions(+), 72 deletions(-)
|
|
|
|
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
|
|
index 05f0bd2576d..9f7151bd542 100644
|
|
--- a/src/fe_utils/string_utils.c
|
|
+++ b/src/fe_utils/string_utils.c
|
|
@@ -106,6 +106,7 @@ fmtIdEnc(const char *rawid, int encoding)
|
|
|
|
const char *cp;
|
|
bool need_quotes = false;
|
|
+ size_t remaining = strlen(rawid);
|
|
|
|
/*
|
|
* These checks need to match the identifier production in scan.l. Don't
|
|
@@ -119,7 +120,8 @@ fmtIdEnc(const char *rawid, int encoding)
|
|
else
|
|
{
|
|
/* otherwise check the entire string */
|
|
- for (cp = rawid; *cp; cp++)
|
|
+ cp = rawid;
|
|
+ for (size_t i = 0; i < remaining; i++, cp++)
|
|
{
|
|
if (!((*cp >= 'a' && *cp <= 'z')
|
|
|| (*cp >= '0' && *cp <= '9')
|
|
@@ -155,17 +157,90 @@ fmtIdEnc(const char *rawid, int encoding)
|
|
else
|
|
{
|
|
appendPQExpBufferChar(id_return, '"');
|
|
- for (cp = rawid; *cp; cp++)
|
|
+
|
|
+ cp = &rawid[0];
|
|
+ while (remaining > 0)
|
|
{
|
|
- /*
|
|
- * Did we find a double-quote in the string? Then make this a
|
|
- * double double-quote per SQL99. Before, we put in a
|
|
- * backslash/double-quote pair. - thomas 2000-08-05
|
|
- */
|
|
- if (*cp == '"')
|
|
- appendPQExpBufferChar(id_return, '"');
|
|
- appendPQExpBufferChar(id_return, *cp);
|
|
+ int charlen;
|
|
+
|
|
+ /* Fast path for plain ASCII */
|
|
+ if (!IS_HIGHBIT_SET(*cp))
|
|
+ {
|
|
+ /*
|
|
+ * Did we find a double-quote in the string? Then make this a
|
|
+ * double double-quote per SQL99. Before, we put in a
|
|
+ * backslash/double-quote pair. - thomas 2000-08-05
|
|
+ */
|
|
+ if (*cp == '"')
|
|
+ appendPQExpBufferChar(id_return, '"');
|
|
+ appendPQExpBufferChar(id_return, *cp);
|
|
+ remaining--;
|
|
+ cp++;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Slow path for possible multibyte characters */
|
|
+ charlen = pg_encoding_mblen(encoding, cp);
|
|
+
|
|
+ if (remaining < charlen)
|
|
+ {
|
|
+ /*
|
|
+ * If the character is longer than the available input,
|
|
+ * replace the string with an invalid sequence. The invalid
|
|
+ * sequence ensures that the escaped string will trigger an
|
|
+ * error on the server-side, even if we can't directly report
|
|
+ * an error here.
|
|
+ */
|
|
+ enlargePQExpBuffer(id_return, 2);
|
|
+ pg_encoding_set_invalid(encoding,
|
|
+ id_return->data + id_return->len);
|
|
+ id_return->len += 2;
|
|
+ id_return->data[id_return->len] = '\0';
|
|
+
|
|
+ /* there's no more input data, so we can stop */
|
|
+ break;
|
|
+ }
|
|
+ else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
|
|
+ {
|
|
+ /*
|
|
+ * Multibyte character is invalid. It's important to verify
|
|
+ * that as invalid multi-byte characters could e.g. be used to
|
|
+ * "skip" over quote characters, e.g. when parsing
|
|
+ * character-by-character.
|
|
+ *
|
|
+ * Replace the bytes corresponding to the invalid character
|
|
+ * with an invalid sequence, for the same reason as above.
|
|
+ *
|
|
+ * It would be a bit faster to verify the whole string the
|
|
+ * first time we encounter a set highbit, but this way we can
|
|
+ * replace just the invalid characters, which probably makes
|
|
+ * it easier for users to find the invalidly encoded portion
|
|
+ * of a larger string.
|
|
+ */
|
|
+ enlargePQExpBuffer(id_return, 2);
|
|
+ pg_encoding_set_invalid(encoding,
|
|
+ id_return->data + id_return->len);
|
|
+ id_return->len += 2;
|
|
+ id_return->data[id_return->len] = '\0';
|
|
+
|
|
+ /*
|
|
+ * Copy the rest of the string after the invalid multi-byte
|
|
+ * character.
|
|
+ */
|
|
+ remaining -= charlen;
|
|
+ cp += charlen;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ for (int i = 0; i < charlen; i++)
|
|
+ {
|
|
+ appendPQExpBufferChar(id_return, *cp);
|
|
+ remaining--;
|
|
+ cp++;
|
|
+ }
|
|
+ }
|
|
}
|
|
+
|
|
appendPQExpBufferChar(id_return, '"');
|
|
}
|
|
|
|
@@ -292,6 +367,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
|
|
size_t length = strlen(str);
|
|
const char *source = str;
|
|
char *target;
|
|
+ size_t remaining = length;
|
|
|
|
if (!enlargePQExpBuffer(buf, 2 * length + 2))
|
|
return;
|
|
@@ -299,10 +375,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
|
|
target = buf->data + buf->len;
|
|
*target++ = '\'';
|
|
|
|
- while (*source != '\0')
|
|
+ while (remaining > 0)
|
|
{
|
|
char c = *source;
|
|
- int len;
|
|
+ int charlen;
|
|
int i;
|
|
|
|
/* Fast path for plain ASCII */
|
|
@@ -314,39 +390,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
|
|
/* Copy the character */
|
|
*target++ = c;
|
|
source++;
|
|
+ remaining--;
|
|
continue;
|
|
}
|
|
|
|
/* Slow path for possible multibyte characters */
|
|
- len = PQmblen(source, encoding);
|
|
+ charlen = PQmblen(source, encoding);
|
|
|
|
- /* Copy the character */
|
|
- for (i = 0; i < len; i++)
|
|
+ if (remaining < charlen)
|
|
{
|
|
- if (*source == '\0')
|
|
- break;
|
|
- *target++ = *source++;
|
|
- }
|
|
+ /*
|
|
+ * If the character is longer than the available input, replace
|
|
+ * the string with an invalid sequence. The invalid sequence
|
|
+ * ensures that the escaped string will trigger an error on the
|
|
+ * server-side, even if we can't directly report an error here.
|
|
+ *
|
|
+ * We know there's enough space for the invalid sequence because
|
|
+ * the "target" buffer is 2 * length + 2 long, and at worst we're
|
|
+ * replacing a single input byte with two invalid bytes.
|
|
+ */
|
|
+ pg_encoding_set_invalid(encoding, target);
|
|
+ target += 2;
|
|
|
|
- /*
|
|
- * If we hit premature end of string (ie, incomplete multibyte
|
|
- * character), try to pad out to the correct length with spaces. We
|
|
- * may not be able to pad completely, but we will always be able to
|
|
- * insert at least one pad space (since we'd not have quoted a
|
|
- * multibyte character). This should be enough to make a string that
|
|
- * the server will error out on.
|
|
- */
|
|
- if (i < len)
|
|
+ /* there's no more valid input data, so we can stop */
|
|
+ break;
|
|
+ }
|
|
+ else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
|
|
{
|
|
- char *stop = buf->data + buf->maxlen - 2;
|
|
+ /*
|
|
+ * Multibyte character is invalid. It's important to verify that
|
|
+ * as invalid multi-byte characters could e.g. be used to "skip"
|
|
+ * over quote characters, e.g. when parsing
|
|
+ * character-by-character.
|
|
+ *
|
|
+ * Replace the bytes corresponding to the invalid character with
|
|
+ * an invalid sequence, for the same reason as above.
|
|
+ *
|
|
+ * It would be a bit faster to verify the whole string the first
|
|
+ * time we encounter a set highbit, but this way we can replace
|
|
+ * just the invalid characters, which probably makes it easier for
|
|
+ * users to find the invalidly encoded portion of a larger string.
|
|
+ */
|
|
+ pg_encoding_set_invalid(encoding, target);
|
|
+ target += 2;
|
|
+ remaining -= charlen;
|
|
|
|
- for (; i < len; i++)
|
|
+ /*
|
|
+ * Copy the rest of the string after the invalid multi-byte
|
|
+ * character.
|
|
+ */
|
|
+ source += charlen;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* Copy the character */
|
|
+ for (i = 0; i < charlen; i++)
|
|
{
|
|
- if (target >= stop)
|
|
- break;
|
|
- *target++ = ' ';
|
|
+ *target++ = *source++;
|
|
+ remaining--;
|
|
}
|
|
- break;
|
|
}
|
|
}
|
|
|
|
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
|
|
index ff101c4ca2a..97cd2c53673 100644
|
|
--- a/src/interfaces/libpq/fe-exec.c
|
|
+++ b/src/interfaces/libpq/fe-exec.c
|
|
@@ -3348,15 +3348,15 @@ PQescapeStringInternal(PGconn *conn,
|
|
{
|
|
const char *source = from;
|
|
char *target = to;
|
|
- size_t remaining = length;
|
|
+ size_t remaining = strnlen(from, length);
|
|
|
|
if (error)
|
|
*error = 0;
|
|
|
|
- while (remaining > 0 && *source != '\0')
|
|
+ while (remaining > 0)
|
|
{
|
|
char c = *source;
|
|
- int len;
|
|
+ int charlen;
|
|
int i;
|
|
|
|
/* Fast path for plain ASCII */
|
|
@@ -3373,39 +3373,48 @@ PQescapeStringInternal(PGconn *conn,
|
|
}
|
|
|
|
/* Slow path for possible multibyte characters */
|
|
- len = pg_encoding_mblen(encoding, source);
|
|
+ charlen = pg_encoding_mblen(encoding, source);
|
|
|
|
- /* Copy the character */
|
|
- for (i = 0; i < len; i++)
|
|
- {
|
|
- if (remaining == 0 || *source == '\0')
|
|
- break;
|
|
- *target++ = *source++;
|
|
- remaining--;
|
|
- }
|
|
-
|
|
- /*
|
|
- * If we hit premature end of string (ie, incomplete multibyte
|
|
- * character), try to pad out to the correct length with spaces. We
|
|
- * may not be able to pad completely, but we will always be able to
|
|
- * insert at least one pad space (since we'd not have quoted a
|
|
- * multibyte character). This should be enough to make a string that
|
|
- * the server will error out on.
|
|
- */
|
|
- if (i < len)
|
|
+ if (remaining < charlen ||
|
|
+ pg_encoding_verifymbchar(encoding, source, charlen) == -1)
|
|
{
|
|
+ /*
|
|
+ * If the character is incomplete or invalidly encoded, report an
|
|
+ * error if possible, and replace the string with an invalid
|
|
+ * sequence. The invalid sequence ensures that the escaped string
|
|
+ * will trigger an error on the server-side, even if we can't
|
|
+ * directly report an error here.
|
|
+ *
|
|
+ * This isn't *that* crucial when we can report an error to the
|
|
+ * caller, but if we can't, the caller will use this string
|
|
+ * unmodified and it needs to be safe for parsing.
|
|
+ *
|
|
+ * We know there's enough space for the invalid sequence because
|
|
+ * the "to" buffer needs to be at least 2 * length + 1 long, and
|
|
+ * at worst we're replacing a single input byte with two invalid
|
|
+ * bytes.
|
|
+ */
|
|
if (error)
|
|
*error = 1;
|
|
if (conn)
|
|
- printfPQExpBuffer(&conn->errorMessage,
|
|
- libpq_gettext("incomplete multibyte character\n"));
|
|
- for (; i < len; i++)
|
|
+ libpq_append_conn_error(conn, "incomplete multibyte character");
|
|
+
|
|
+ pg_encoding_set_invalid(encoding, target);
|
|
+ target += 2;
|
|
+ source++;
|
|
+ remaining--;
|
|
+
|
|
+ /* there's no more input data, so we can stop */
|
|
+ break;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* Copy the character */
|
|
+ for (i = 0; i < charlen; i++)
|
|
{
|
|
- if (((size_t) (target - to)) / 2 >= length)
|
|
- break;
|
|
- *target++ = ' ';
|
|
+ *target++ = *source++;
|
|
+ remaining--;
|
|
}
|
|
- break;
|
|
}
|
|
}
|
|
|
|
@@ -3451,21 +3460,27 @@ PQescapeString(char *to, const char *from, size_t length)
|
|
static char *
|
|
PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
|
|
{
|
|
- const char *s;
|
|
+ const char *s;
|
|
char *result;
|
|
char *rp;
|
|
int num_quotes = 0; /* single or double, depending on as_ident */
|
|
int num_backslashes = 0;
|
|
- int input_len;
|
|
- int result_size;
|
|
+ size_t input_len = strnlen(str, len);
|
|
+ size_t result_size;
|
|
char quote_char = as_ident ? '"' : '\'';
|
|
+ bool validated_mb = false;
|
|
|
|
/* We must have a connection, else fail immediately. */
|
|
if (!conn)
|
|
return NULL;
|
|
|
|
- /* Scan the string for characters that must be escaped. */
|
|
- for (s = str; (s - str) < len && *s != '\0'; ++s)
|
|
+
|
|
+ /*
|
|
+ * Scan the string for characters that must be escaped and for invalidly
|
|
+ * encoded data.
|
|
+ */
|
|
+ s = str;
|
|
+ for (size_t remaining = input_len; remaining > 0; remaining--, s++)
|
|
{
|
|
if (*s == quote_char)
|
|
++num_quotes;
|
|
@@ -3478,21 +3493,42 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
|
|
/* Slow path for possible multibyte characters */
|
|
charlen = pg_encoding_mblen(conn->client_encoding, s);
|
|
|
|
- /* Multibyte character overruns allowable length. */
|
|
- if ((s - str) + charlen > len || memchr(s, 0, charlen) != NULL)
|
|
+ if (charlen > remaining)
|
|
{
|
|
printfPQExpBuffer(&conn->errorMessage,
|
|
libpq_gettext("incomplete multibyte character\n"));
|
|
return NULL;
|
|
}
|
|
|
|
+ /*
|
|
+ * If we haven't already, check that multibyte characters are
|
|
+ * valid. It's important to verify that as invalid multi-byte
|
|
+ * characters could e.g. be used to "skip" over quote characters,
|
|
+ * e.g. when parsing character-by-character.
|
|
+ *
|
|
+ * We check validity once, for the whole remainder of the string,
|
|
+ * when we first encounter any multi-byte character. Some
|
|
+ * encodings have optimized implementations for longer strings.
|
|
+ */
|
|
+ if (!validated_mb)
|
|
+ {
|
|
+ if (pg_encoding_verifymbstr(conn->client_encoding, s, remaining)
|
|
+ != remaining)
|
|
+ {
|
|
+ printfPQExpBuffer(&conn->errorMessage,
|
|
+ libpq_gettext("invalid multibyte character\n"));
|
|
+ return NULL;
|
|
+ }
|
|
+ validated_mb = true;
|
|
+ }
|
|
+
|
|
/* Adjust s, bearing in mind that for loop will increment it. */
|
|
s += charlen - 1;
|
|
+ remaining -= charlen - 1;
|
|
}
|
|
}
|
|
|
|
/* Allocate output buffer. */
|
|
- input_len = s - str;
|
|
result_size = input_len + num_quotes + 3; /* two quotes, plus a NUL */
|
|
if (!as_ident && num_backslashes > 0)
|
|
result_size += num_backslashes + 2;
|
|
@@ -3538,7 +3574,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
|
|
}
|
|
else
|
|
{
|
|
- for (s = str; s - str < input_len; ++s)
|
|
+ s = str;
|
|
+ for (size_t remaining = input_len; remaining > 0; remaining--, s++)
|
|
{
|
|
if (*s == quote_char || (!as_ident && *s == '\\'))
|
|
{
|
|
@@ -3556,6 +3593,7 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident)
|
|
*rp++ = *s;
|
|
if (--i == 0)
|
|
break;
|
|
+ remaining--;
|
|
++s; /* for loop will provide the final increment */
|
|
}
|
|
}
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 3751ccde18122412fcbfcc2df583cf66fefdbab0 Mon Sep 17 00:00:00 2001
|
|
From: Tom Lane <tgl@sss.pgh.pa.us>
|
|
Date: Mon, 10 Feb 2025 16:30:03 -0500
|
|
Subject: [PATCH 4/8] Backport upstream commit
|
|
5bf12323b6b8b05790aab6876555568898f4fc81 Adapt appendPsqlMetaConnect() to the
|
|
new fmtId() encoding expectations.
|
|
|
|
We need to tell fmtId() what encoding to assume, but this function
|
|
doesn't know that. Fortunately we can fix that without changing the
|
|
function's API, because we can just use SQL_ASCII. That's because
|
|
database names in connection requests are effectively binary not text:
|
|
no encoding-aware processing will happen on them.
|
|
|
|
This fixes XversionUpgrade failures seen in the buildfarm. The
|
|
alternative of having pg_upgrade use setFmtEncoding() is unappetizing,
|
|
given that it's connecting to multiple databases that may have
|
|
different encodings.
|
|
|
|
Andres Freund, Noah Misch, Tom Lane
|
|
|
|
Security: CVE-2025-1094
|
|
---
|
|
src/fe_utils/string_utils.c | 21 +++++++++++++++------
|
|
1 file changed, 15 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
|
|
index 9f7151bd542..a289d3001eb 100644
|
|
--- a/src/fe_utils/string_utils.c
|
|
+++ b/src/fe_utils/string_utils.c
|
|
@@ -792,16 +792,22 @@ appendPsqlMetaConnect(PQExpBuffer buf, const char *dbname)
|
|
}
|
|
}
|
|
|
|
- appendPQExpBufferStr(buf, "\\connect ");
|
|
if (complex)
|
|
{
|
|
PQExpBufferData connstr;
|
|
|
|
initPQExpBuffer(&connstr);
|
|
- appendPQExpBuffer(&connstr, "dbname=");
|
|
- appendConnStrVal(&connstr, dbname);
|
|
|
|
- appendPQExpBuffer(buf, "-reuse-previous=on ");
|
|
+ /*
|
|
+ * Force the target psql's encoding to SQL_ASCII. We don't really
|
|
+ * know the encoding of the database name, and it doesn't matter as
|
|
+ * long as psql will forward it to the server unchanged.
|
|
+ */
|
|
+ appendPQExpBufferStr(buf, "\\encoding SQL_ASCII\n");
|
|
+ appendPQExpBufferStr(buf, "\\connect -reuse-previous=on ");
|
|
+
|
|
+ appendPQExpBufferStr(&connstr, "dbname=");
|
|
+ appendConnStrVal(&connstr, dbname);
|
|
|
|
/*
|
|
* As long as the name does not contain a newline, SQL identifier
|
|
@@ -809,12 +815,15 @@ appendPsqlMetaConnect(PQExpBuffer buf, const char *dbname)
|
|
* involve psql-interpreted single quotes, which behaved differently
|
|
* before PostgreSQL 9.2.
|
|
*/
|
|
- appendPQExpBufferStr(buf, fmtId(connstr.data));
|
|
+ appendPQExpBufferStr(buf, fmtIdEnc(connstr.data, PG_SQL_ASCII));
|
|
|
|
termPQExpBuffer(&connstr);
|
|
}
|
|
else
|
|
- appendPQExpBufferStr(buf, fmtId(dbname));
|
|
+ {
|
|
+ appendPQExpBufferStr(buf, "\\connect ");
|
|
+ appendPQExpBufferStr(buf, fmtIdEnc(dbname, PG_SQL_ASCII));
|
|
+ }
|
|
appendPQExpBufferChar(buf, '\n');
|
|
}
|
|
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 84b7b93568fa4523afb66d2d1776f5e24b5db1de Mon Sep 17 00:00:00 2001
|
|
From: Tom Lane <tgl@sss.pgh.pa.us>
|
|
Date: Sat, 15 Feb 2025 16:20:21 -0500
|
|
Subject: [PATCH 5/8] Backport upstream commit:
|
|
9f45e6a91d8460ac0b1f30e6ae3eefb185b8d0ab Make escaping functions retain
|
|
trailing bytes of an invalid character.
|
|
|
|
Instead of dropping the trailing byte(s) of an invalid or incomplete
|
|
multibyte character, replace only the first byte with a known-invalid
|
|
sequence, and process the rest normally. This seems less likely to
|
|
confuse incautious callers than the behavior adopted in 5dc1e42b4.
|
|
|
|
While we're at it, adjust PQescapeStringInternal to produce at most
|
|
one bleat about invalid multibyte characters per string. This
|
|
matches the behavior of PQescapeInternal, and avoids the risk of
|
|
producing tons of repetitive junk if a long string is simply given
|
|
in the wrong encoding.
|
|
|
|
This is a followup to the fixes for CVE-2025-1094, and should be
|
|
included if cherry-picking those fixes.
|
|
|
|
Author: Andres Freund <andres@anarazel.de>
|
|
Co-authored-by: Tom Lane <tgl@sss.pgh.pa.us>
|
|
Reported-by: Jeff Davis <pgsql@j-davis.com>
|
|
Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com
|
|
---
|
|
src/fe_utils/string_utils.c | 91 +++++++++++++---------------------
|
|
src/interfaces/libpq/fe-exec.c | 22 ++++----
|
|
2 files changed, 47 insertions(+), 66 deletions(-)
|
|
|
|
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
|
|
index a289d3001eb..a2d5ccd1e28 100644
|
|
--- a/src/fe_utils/string_utils.c
|
|
+++ b/src/fe_utils/string_utils.c
|
|
@@ -182,40 +182,25 @@ fmtIdEnc(const char *rawid, int encoding)
|
|
/* Slow path for possible multibyte characters */
|
|
charlen = pg_encoding_mblen(encoding, cp);
|
|
|
|
- if (remaining < charlen)
|
|
- {
|
|
- /*
|
|
- * If the character is longer than the available input,
|
|
- * replace the string with an invalid sequence. The invalid
|
|
- * sequence ensures that the escaped string will trigger an
|
|
- * error on the server-side, even if we can't directly report
|
|
- * an error here.
|
|
- */
|
|
- enlargePQExpBuffer(id_return, 2);
|
|
- pg_encoding_set_invalid(encoding,
|
|
- id_return->data + id_return->len);
|
|
- id_return->len += 2;
|
|
- id_return->data[id_return->len] = '\0';
|
|
-
|
|
- /* there's no more input data, so we can stop */
|
|
- break;
|
|
- }
|
|
- else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
|
|
+ if (remaining < charlen ||
|
|
+ pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
|
|
{
|
|
/*
|
|
* Multibyte character is invalid. It's important to verify
|
|
- * that as invalid multi-byte characters could e.g. be used to
|
|
+ * that as invalid multibyte characters could e.g. be used to
|
|
* "skip" over quote characters, e.g. when parsing
|
|
* character-by-character.
|
|
*
|
|
- * Replace the bytes corresponding to the invalid character
|
|
- * with an invalid sequence, for the same reason as above.
|
|
+ * Replace the character's first byte with an invalid
|
|
+ * sequence. The invalid sequence ensures that the escaped
|
|
+ * string will trigger an error on the server-side, even if we
|
|
+ * can't directly report an error here.
|
|
*
|
|
* It would be a bit faster to verify the whole string the
|
|
* first time we encounter a set highbit, but this way we can
|
|
- * replace just the invalid characters, which probably makes
|
|
- * it easier for users to find the invalidly encoded portion
|
|
- * of a larger string.
|
|
+ * replace just the invalid data, which probably makes it
|
|
+ * easier for users to find the invalidly encoded portion of a
|
|
+ * larger string.
|
|
*/
|
|
enlargePQExpBuffer(id_return, 2);
|
|
pg_encoding_set_invalid(encoding,
|
|
@@ -224,11 +209,13 @@ fmtIdEnc(const char *rawid, int encoding)
|
|
id_return->data[id_return->len] = '\0';
|
|
|
|
/*
|
|
- * Copy the rest of the string after the invalid multi-byte
|
|
- * character.
|
|
+ * Handle the following bytes as if this byte didn't exist.
|
|
+ * That's safer in case the subsequent bytes contain
|
|
+ * characters that are significant for the caller (e.g. '>' in
|
|
+ * html).
|
|
*/
|
|
- remaining -= charlen;
|
|
- cp += charlen;
|
|
+ remaining--;
|
|
+ cp++;
|
|
}
|
|
else
|
|
{
|
|
@@ -397,49 +384,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
|
|
/* Slow path for possible multibyte characters */
|
|
charlen = PQmblen(source, encoding);
|
|
|
|
- if (remaining < charlen)
|
|
- {
|
|
- /*
|
|
- * If the character is longer than the available input, replace
|
|
- * the string with an invalid sequence. The invalid sequence
|
|
- * ensures that the escaped string will trigger an error on the
|
|
- * server-side, even if we can't directly report an error here.
|
|
- *
|
|
- * We know there's enough space for the invalid sequence because
|
|
- * the "target" buffer is 2 * length + 2 long, and at worst we're
|
|
- * replacing a single input byte with two invalid bytes.
|
|
- */
|
|
- pg_encoding_set_invalid(encoding, target);
|
|
- target += 2;
|
|
-
|
|
- /* there's no more valid input data, so we can stop */
|
|
- break;
|
|
- }
|
|
- else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
|
|
+ if (remaining < charlen ||
|
|
+ pg_encoding_verifymbchar(encoding, source, charlen) == -1)
|
|
{
|
|
/*
|
|
* Multibyte character is invalid. It's important to verify that
|
|
- * as invalid multi-byte characters could e.g. be used to "skip"
|
|
+ * as invalid multibyte characters could e.g. be used to "skip"
|
|
* over quote characters, e.g. when parsing
|
|
* character-by-character.
|
|
*
|
|
- * Replace the bytes corresponding to the invalid character with
|
|
- * an invalid sequence, for the same reason as above.
|
|
+ * Replace the character's first byte with an invalid sequence.
|
|
+ * The invalid sequence ensures that the escaped string will
|
|
+ * trigger an error on the server-side, even if we can't directly
|
|
+ * report an error here.
|
|
+ *
|
|
+ * We know there's enough space for the invalid sequence because
|
|
+ * the "target" buffer is 2 * length + 2 long, and at worst we're
|
|
+ * replacing a single input byte with two invalid bytes.
|
|
*
|
|
* It would be a bit faster to verify the whole string the first
|
|
* time we encounter a set highbit, but this way we can replace
|
|
- * just the invalid characters, which probably makes it easier for
|
|
- * users to find the invalidly encoded portion of a larger string.
|
|
+ * just the invalid data, which probably makes it easier for users
|
|
+ * to find the invalidly encoded portion of a larger string.
|
|
*/
|
|
pg_encoding_set_invalid(encoding, target);
|
|
target += 2;
|
|
- remaining -= charlen;
|
|
|
|
/*
|
|
- * Copy the rest of the string after the invalid multi-byte
|
|
- * character.
|
|
+ * Handle the following bytes as if this byte didn't exist. That's
|
|
+ * safer in case the subsequent bytes contain important characters
|
|
+ * for the caller (e.g. '>' in html).
|
|
*/
|
|
- source += charlen;
|
|
+ source++;
|
|
+ remaining--;
|
|
}
|
|
else
|
|
{
|
|
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
|
|
index 97cd2c53673..a29d19a6268 100644
|
|
--- a/src/interfaces/libpq/fe-exec.c
|
|
+++ b/src/interfaces/libpq/fe-exec.c
|
|
@@ -3349,6 +3349,7 @@ PQescapeStringInternal(PGconn *conn,
|
|
const char *source = from;
|
|
char *target = to;
|
|
size_t remaining = strnlen(from, length);
|
|
+ bool already_complained = false;
|
|
|
|
if (error)
|
|
*error = 0;
|
|
@@ -3379,15 +3380,20 @@ PQescapeStringInternal(PGconn *conn,
|
|
pg_encoding_verifymbchar(encoding, source, charlen) == -1)
|
|
{
|
|
/*
|
|
- * If the character is longer than the available input, report an
|
|
- * error if possible, and replace the string with an invalid
|
|
- * sequence. The invalid sequence ensures that the escaped string
|
|
- * will trigger an error on the server-side, even if we can't
|
|
- * directly report an error here.
|
|
+ * Multibyte character is invalid. It's important to verify that
|
|
+ * as invalid multibyte characters could e.g. be used to "skip"
|
|
+ * over quote characters, e.g. when parsing
|
|
+ * character-by-character.
|
|
+ *
|
|
+ * Report an error if possible, and replace the character's first
|
|
+ * byte with an invalid sequence. The invalid sequence ensures
|
|
+ * that the escaped string will trigger an error on the
|
|
+ * server-side, even if we can't directly report an error here.
|
|
*
|
|
* This isn't *that* crucial when we can report an error to the
|
|
- * caller, but if we can't, the caller will use this string
|
|
- * unmodified and it needs to be safe for parsing.
|
|
+ * caller; but if we can't or the caller ignores it, the caller
|
|
+ * will use this string unmodified and it needs to be safe for
|
|
+ * parsing.
|
|
*
|
|
* We know there's enough space for the invalid sequence because
|
|
* the "to" buffer needs to be at least 2 * length + 1 long, and
|
|
@@ -3404,8 +3410,6 @@ PQescapeStringInternal(PGconn *conn,
|
|
source++;
|
|
remaining--;
|
|
|
|
- /* there's no more input data, so we can stop */
|
|
- break;
|
|
}
|
|
else
|
|
{
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 21118244dad366d20e1d11549df03dd56e76dbaa Mon Sep 17 00:00:00 2001
|
|
From: Tom Lane <tgl@sss.pgh.pa.us>
|
|
Date: Sun, 16 Feb 2025 12:46:35 -0500
|
|
Subject: [PATCH 6/8] In fmtIdEnc(), handle failure of enlargePQExpBuffer().
|
|
|
|
Coverity complained that we weren't doing that, and it's right.
|
|
|
|
This fix just makes fmtIdEnc() honor the general convention that OOM
|
|
causes a PQExpBuffer to become marked "broken", without any immediate
|
|
error. In the pretty-unlikely case that we actually did hit OOM here,
|
|
the end result would be to return an empty string to the caller,
|
|
probably resulting in invalid SQL syntax in an issued command (if
|
|
nothing else went wrong, which is even more unlikely). It's tempting
|
|
to throw an "out of memory" error if the buffer becomes broken, but
|
|
there's not a lot of point in doing that only here and not in hundreds
|
|
of other PQExpBuffer-using places in pg_dump and similar callers.
|
|
The whole issue could do with some non-time-crunched redesign, perhaps.
|
|
|
|
This is a followup to the fixes for CVE-2025-1094, and should be
|
|
included if cherry-picking those fixes.
|
|
---
|
|
src/fe_utils/string_utils.c | 12 +++++++-----
|
|
1 file changed, 7 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
|
|
index a2d5ccd1e28..fe280df3c0f 100644
|
|
--- a/src/fe_utils/string_utils.c
|
|
+++ b/src/fe_utils/string_utils.c
|
|
@@ -202,11 +202,13 @@ fmtIdEnc(const char *rawid, int encoding)
|
|
* easier for users to find the invalidly encoded portion of a
|
|
* larger string.
|
|
*/
|
|
- enlargePQExpBuffer(id_return, 2);
|
|
- pg_encoding_set_invalid(encoding,
|
|
- id_return->data + id_return->len);
|
|
- id_return->len += 2;
|
|
- id_return->data[id_return->len] = '\0';
|
|
+ if (enlargePQExpBuffer(id_return, 2))
|
|
+ {
|
|
+ pg_encoding_set_invalid(encoding,
|
|
+ id_return->data + id_return->len);
|
|
+ id_return->len += 2;
|
|
+ id_return->data[id_return->len] = '\0';
|
|
+ }
|
|
|
|
/*
|
|
* Handle the following bytes as if this byte didn't exist.
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 6f42371a3c3911299c081afe3478022c496b07a9 Mon Sep 17 00:00:00 2001
|
|
From: Filip Janus <fjanus@redhat.com>
|
|
Date: Mon, 17 Mar 2025 18:14:05 +0100
|
|
Subject: [PATCH 7/8] Backport multiple changes from postgresql13, especially
|
|
wchar.c functionality from backend was moved to common directory, it means
|
|
that functionality can be used by server but also by libpq. Due to the
|
|
necessary changes there are a couple of "reverts" from previous commits in
|
|
src/backend/utils/mb/wchar.c but it's expected because now it's linked with
|
|
implementation from common/wchar.c instead of src/backend/utils/mb/wchar.c
|
|
|
|
---
|
|
src/backend/utils/mb/wchar.c | 101 +-
|
|
src/bin/scripts/dropdb.c | 5 +-
|
|
src/common/Makefile | 2 +-
|
|
src/common/wchar.c | 1728 ++++++++++++++++++
|
|
src/include/common/unicode_combining_table.h | 196 ++
|
|
src/include/mb/pg_wchar.h | 4 +
|
|
src/interfaces/libpq/fe-exec.c | 7 +-
|
|
7 files changed, 1958 insertions(+), 85 deletions(-)
|
|
create mode 100644 src/common/wchar.c
|
|
create mode 100644 src/include/common/unicode_combining_table.h
|
|
|
|
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
|
|
index 872241cc804..1ca6094d2a3 100644
|
|
--- a/src/backend/utils/mb/wchar.c
|
|
+++ b/src/backend/utils/mb/wchar.c
|
|
@@ -14,25 +14,6 @@
|
|
#include "mb/pg_wchar.h"
|
|
|
|
|
|
-/*
|
|
- * In today's multibyte encodings other than UTF8, this two-byte sequence
|
|
- * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
|
|
- *
|
|
- * For historical reasons, several verifychar implementations opt to reject
|
|
- * this pair specifically. Byte pair range constraints, in encoding
|
|
- * originator documentation, always excluded this pair. No core conversion
|
|
- * could translate it. However, longstanding verifychar implementations
|
|
- * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
|
|
- * pairs not valid per encoding originator documentation. To avoid tightening
|
|
- * core or non-core conversions in a security patch, we sought this one pair.
|
|
- *
|
|
- * PQescapeString() historically used spaces for BYTE1; many other values
|
|
- * could suffice for BYTE1.
|
|
- */
|
|
-#define NONUTF8_INVALID_BYTE0 (0x8d)
|
|
-#define NONUTF8_INVALID_BYTE1 (' ')
|
|
-
|
|
-
|
|
/*
|
|
* Operations on multi-byte encodings are driven by a table of helper
|
|
* functions.
|
|
@@ -496,7 +477,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
/*
|
|
* Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
|
|
* space allocated.
|
|
- */
|
|
+ *
|
|
unsigned char *
|
|
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
|
|
{
|
|
@@ -525,7 +506,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
|
|
|
|
return utf8string;
|
|
}
|
|
-
|
|
+*/
|
|
/*
|
|
* Trivial conversion from pg_wchar to UTF-8.
|
|
* caller should allocate enough space for "to"
|
|
@@ -562,7 +543,7 @@ pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
*
|
|
* pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
|
|
* other places would need to be fixed to change this.
|
|
- */
|
|
+ *
|
|
int
|
|
pg_utf_mblen(const unsigned char *s)
|
|
{
|
|
@@ -586,7 +567,7 @@ pg_utf_mblen(const unsigned char *s)
|
|
len = 1;
|
|
return len;
|
|
}
|
|
-
|
|
+*/
|
|
/*
|
|
* This is an implementation of wcwidth() and wcswidth() as defined in
|
|
* "The Single UNIX Specification, Version 2, The Open Group, 1997"
|
|
@@ -765,7 +746,7 @@ ucs_wcwidth(pg_wchar ucs)
|
|
* This is a one-character version of pg_utf2wchar_with_len.
|
|
*
|
|
* No error checks here, c must point to a long-enough string.
|
|
- */
|
|
+ *
|
|
pg_wchar
|
|
utf8_to_unicode(const unsigned char *c)
|
|
{
|
|
@@ -784,10 +765,10 @@ utf8_to_unicode(const unsigned char *c)
|
|
((c[2] & 0x3f) << 6) |
|
|
(c[3] & 0x3f));
|
|
else
|
|
- /* that is an invalid code on purpose */
|
|
+ // that is an invalid code on purpose
|
|
return 0xffffffff;
|
|
}
|
|
-
|
|
+*/
|
|
static int
|
|
pg_utf_dsplen(const unsigned char *s)
|
|
{
|
|
@@ -917,7 +898,7 @@ pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
*to = 0;
|
|
return cnt;
|
|
}
|
|
-
|
|
+/*
|
|
int
|
|
pg_mule_mblen(const unsigned char *s)
|
|
{
|
|
@@ -932,9 +913,9 @@ pg_mule_mblen(const unsigned char *s)
|
|
else if (IS_LCPRV2(*s))
|
|
len = 4;
|
|
else
|
|
- len = 1; /* assume ASCII */
|
|
+ len = 1;
|
|
return len;
|
|
-}
|
|
+}*/
|
|
|
|
static int
|
|
pg_mule_dsplen(const unsigned char *s)
|
|
@@ -1413,11 +1394,6 @@ pg_big5_verifier(const unsigned char *s, int len)
|
|
if (len < l)
|
|
return -1;
|
|
|
|
- if (l == 2 &&
|
|
- s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
- s[1] == NONUTF8_INVALID_BYTE1)
|
|
- return -1;
|
|
-
|
|
while (--l > 0)
|
|
{
|
|
if (*++s == '\0')
|
|
@@ -1438,11 +1414,6 @@ pg_gbk_verifier(const unsigned char *s, int len)
|
|
if (len < l)
|
|
return -1;
|
|
|
|
- if (l == 2 &&
|
|
- s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
- s[1] == NONUTF8_INVALID_BYTE1)
|
|
- return -1;
|
|
-
|
|
while (--l > 0)
|
|
{
|
|
if (*++s == '\0')
|
|
@@ -1463,11 +1434,6 @@ pg_uhc_verifier(const unsigned char *s, int len)
|
|
if (len < l)
|
|
return -1;
|
|
|
|
- if (l == 2 &&
|
|
- s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
- s[1] == NONUTF8_INVALID_BYTE1)
|
|
- return -1;
|
|
-
|
|
while (--l > 0)
|
|
{
|
|
if (*++s == '\0')
|
|
@@ -1535,7 +1501,7 @@ pg_utf8_verifier(const unsigned char *s, int len)
|
|
*
|
|
* length is assumed to have been obtained by pg_utf_mblen(), and the
|
|
* caller must have checked that that many bytes are present in the buffer.
|
|
- */
|
|
+ *
|
|
bool
|
|
pg_utf8_islegal(const unsigned char *source, int length)
|
|
{
|
|
@@ -1544,18 +1510,15 @@ pg_utf8_islegal(const unsigned char *source, int length)
|
|
switch (length)
|
|
{
|
|
default:
|
|
- /* reject lengths 5 and 6 for now */
|
|
return false;
|
|
case 4:
|
|
a = source[3];
|
|
if (a < 0x80 || a > 0xBF)
|
|
return false;
|
|
- /* FALL THRU */
|
|
case 3:
|
|
a = source[2];
|
|
if (a < 0x80 || a > 0xBF)
|
|
return false;
|
|
- /* FALL THRU */
|
|
case 2:
|
|
a = source[1];
|
|
switch (*source)
|
|
@@ -1581,7 +1544,6 @@ pg_utf8_islegal(const unsigned char *source, int length)
|
|
return false;
|
|
break;
|
|
}
|
|
- /* FALL THRU */
|
|
case 1:
|
|
a = *source;
|
|
if (a >= 0x80 && a < 0xC2)
|
|
@@ -1592,7 +1554,7 @@ pg_utf8_islegal(const unsigned char *source, int length)
|
|
}
|
|
return true;
|
|
}
|
|
-
|
|
+*/
|
|
#ifndef FRONTEND
|
|
|
|
/*
|
|
@@ -1802,26 +1764,13 @@ pg_eucjp_increment(unsigned char *charptr, int length)
|
|
#endif /* !FRONTEND */
|
|
|
|
|
|
-/*
|
|
- * Fills the provided buffer with two bytes such that:
|
|
- * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
|
|
- */
|
|
-void
|
|
-pg_encoding_set_invalid(int encoding, char *dst)
|
|
-{
|
|
- Assert(pg_encoding_max_length(encoding) > 1);
|
|
-
|
|
- dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
|
|
- dst[1] = NONUTF8_INVALID_BYTE1;
|
|
-}
|
|
-
|
|
/*
|
|
*-------------------------------------------------------------------
|
|
* encoding info table
|
|
* XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
|
|
*-------------------------------------------------------------------
|
|
*/
|
|
-const pg_wchar_tbl pg_wchar_table[] = {
|
|
+const pg_wchar_tbl pg_wchar_table1[] = {
|
|
{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
|
|
{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
|
|
{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
|
|
@@ -1875,7 +1824,7 @@ pg_mic_mblen(const unsigned char *mbstr)
|
|
|
|
/*
|
|
* Returns the byte length of a multibyte character.
|
|
- */
|
|
+ *
|
|
int
|
|
pg_encoding_mblen(int encoding, const char *mbstr)
|
|
{
|
|
@@ -1883,10 +1832,10 @@ pg_encoding_mblen(int encoding, const char *mbstr)
|
|
pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
|
|
pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
|
|
}
|
|
-
|
|
+*/
|
|
/*
|
|
* Returns the display length of a multibyte character.
|
|
- */
|
|
+ *
|
|
int
|
|
pg_encoding_dsplen(int encoding, const char *mbstr)
|
|
{
|
|
@@ -1894,12 +1843,12 @@ pg_encoding_dsplen(int encoding, const char *mbstr)
|
|
pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
|
|
pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
|
|
}
|
|
-
|
|
+*/
|
|
/*
|
|
* Verify the first multibyte character of the given string.
|
|
* Return its byte length if good, -1 if bad. (See comments above for
|
|
* full details of the mbverify API.)
|
|
- */
|
|
+ *
|
|
int
|
|
pg_encoding_verifymb(int encoding, const char *mbstr, int len)
|
|
{
|
|
@@ -1907,24 +1856,18 @@ pg_encoding_verifymb(int encoding, const char *mbstr, int len)
|
|
pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
|
|
pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
|
|
}
|
|
-
|
|
+*/
|
|
/*
|
|
* fetch maximum length of a given encoding
|
|
- */
|
|
+ *
|
|
int
|
|
pg_encoding_max_length(int encoding)
|
|
{
|
|
Assert(PG_VALID_ENCODING(encoding));
|
|
|
|
- /*
|
|
- * Check for the encoding despite the assert, due to some mingw versions
|
|
- * otherwise issuing bogus warnings.
|
|
- */
|
|
- return PG_VALID_ENCODING(encoding) ?
|
|
- pg_wchar_table[encoding].maxmblen :
|
|
- pg_wchar_table[PG_SQL_ASCII].maxmblen;
|
|
+ return pg_wchar_table[encoding].maxmblen;
|
|
}
|
|
-
|
|
+*/
|
|
#ifndef FRONTEND
|
|
|
|
/*
|
|
diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c
|
|
index 0d636d0ef46..ed3a2c8c19a 100644
|
|
--- a/src/bin/scripts/dropdb.c
|
|
+++ b/src/bin/scripts/dropdb.c
|
|
@@ -141,10 +141,9 @@ main(int argc, char *argv[])
|
|
conn = connectMaintenanceDatabase(&cparams, progname, echo);
|
|
|
|
initPQExpBuffer(&sql);
|
|
- appendPQExpBuffer(&sql, "DROP DATABASE %s%s%s;",
|
|
+ appendPQExpBuffer(&sql, "DROP DATABASE %s%s;",
|
|
(if_exists ? "IF EXISTS " : ""),
|
|
- fmtIdEnc(dbname, PQclientEncoding(conn)),
|
|
- force ? " WITH (FORCE)" : "");
|
|
+ fmtIdEnc(dbname, PQclientEncoding(conn)));
|
|
|
|
if (echo)
|
|
printf("%s\n", sql.data);
|
|
diff --git a/src/common/Makefile b/src/common/Makefile
|
|
index 2f22b9b101d..c26d938b31e 100644
|
|
--- a/src/common/Makefile
|
|
+++ b/src/common/Makefile
|
|
@@ -50,7 +50,7 @@ OBJS_COMMON = base64.o config_info.o controldata_utils.o d2s.o exec.o f2s.o \
|
|
file_perm.o ip.o keywords.o kwlookup.o link-canary.o md5.o \
|
|
pg_lzcompress.o pgfnames.o psprintf.o relpath.o \
|
|
rmtree.o saslprep.o scram-common.o string.o unicode_norm.o \
|
|
- username.o wait_error.o
|
|
+ username.o wait_error.o wchar.o
|
|
|
|
ifeq ($(with_openssl),yes)
|
|
OBJS_COMMON += sha2_openssl.o
|
|
diff --git a/src/common/wchar.c b/src/common/wchar.c
|
|
new file mode 100644
|
|
index 00000000000..85822b2c3b5
|
|
--- /dev/null
|
|
+++ b/src/common/wchar.c
|
|
@@ -0,0 +1,1728 @@
|
|
+/*-------------------------------------------------------------------------
|
|
+ *
|
|
+ * wchar.c
|
|
+ * Functions for working with multibyte characters in various encodings.
|
|
+ *
|
|
+ * Portions Copyright (c) 1998-2020, PostgreSQL Global Development Group
|
|
+ *
|
|
+ * IDENTIFICATION
|
|
+ * src/common/wchar.c
|
|
+ *
|
|
+ *-------------------------------------------------------------------------
|
|
+ */
|
|
+#include "c.h"
|
|
+
|
|
+#include "mb/pg_wchar.h"
|
|
+
|
|
+
|
|
+/*
|
|
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
|
|
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
|
|
+ *
|
|
+ * For historical reasons, several verifychar implementations opt to reject
|
|
+ * this pair specifically. Byte pair range constraints, in encoding
|
|
+ * originator documentation, always excluded this pair. No core conversion
|
|
+ * could translate it. However, longstanding verifychar implementations
|
|
+ * accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
|
|
+ * pairs not valid per encoding originator documentation. To avoid tightening
|
|
+ * core or non-core conversions in a security patch, we sought this one pair.
|
|
+ *
|
|
+ * PQescapeString() historically used spaces for BYTE1; many other values
|
|
+ * could suffice for BYTE1.
|
|
+ */
|
|
+#define NONUTF8_INVALID_BYTE0 (0x8d)
|
|
+#define NONUTF8_INVALID_BYTE1 (' ')
|
|
+
|
|
+
|
|
+/*
|
|
+ * Operations on multi-byte encodings are driven by a table of helper
|
|
+ * functions.
|
|
+ *
|
|
+ * To add an encoding support, define mblen(), dsplen() and verifier() for
|
|
+ * the encoding. For server-encodings, also define mb2wchar() and wchar2mb()
|
|
+ * conversion functions.
|
|
+ *
|
|
+ * These functions generally assume that their input is validly formed.
|
|
+ * The "verifier" functions, further down in the file, have to be more
|
|
+ * paranoid.
|
|
+ *
|
|
+ * We expect that mblen() does not need to examine more than the first byte
|
|
+ * of the character to discover the correct length. GB18030 is an exception
|
|
+ * to that rule, though, as it also looks at second byte. But even that
|
|
+ * behaves in a predictable way, if you only pass the first byte: it will
|
|
+ * treat 4-byte encoded characters as two 2-byte encoded characters, which is
|
|
+ * good enough for all current uses.
|
|
+ *
|
|
+ * Note: for the display output of psql to work properly, the return values
|
|
+ * of the dsplen functions must conform to the Unicode standard. In particular
|
|
+ * the NUL character is zero width and control characters are generally
|
|
+ * width -1. It is recommended that non-ASCII encodings refer their ASCII
|
|
+ * subset to the ASCII routines to ensure consistency.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * SQL/ASCII
|
|
+ */
|
|
+static int
|
|
+pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ *to++ = *from++;
|
|
+ len--;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_ascii_mblen(const unsigned char *s)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_ascii_dsplen(const unsigned char *s)
|
|
+{
|
|
+ if (*s == '\0')
|
|
+ return 0;
|
|
+ if (*s < 0x20 || *s == 0x7f)
|
|
+ return -1;
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * EUC
|
|
+ */
|
|
+static int
|
|
+pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
|
|
+ * KANA") */
|
|
+ {
|
|
+ from++;
|
|
+ *to = (SS2 << 8) | *from++;
|
|
+ len -= 2;
|
|
+ }
|
|
+ else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
|
|
+ {
|
|
+ from++;
|
|
+ *to = (SS3 << 16) | (*from++ << 8);
|
|
+ *to |= *from++;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
|
|
+ {
|
|
+ *to = *from++ << 8;
|
|
+ *to |= *from++;
|
|
+ len -= 2;
|
|
+ }
|
|
+ else /* must be ASCII */
|
|
+ {
|
|
+ *to = *from++;
|
|
+ len--;
|
|
+ }
|
|
+ to++;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+static inline int
|
|
+pg_euc_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s == SS2)
|
|
+ len = 2;
|
|
+ else if (*s == SS3)
|
|
+ len = 3;
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = 1;
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static inline int
|
|
+pg_euc_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s == SS2)
|
|
+ len = 2;
|
|
+ else if (*s == SS3)
|
|
+ len = 2;
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s);
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * EUC_JP
|
|
+ */
|
|
+static int
|
|
+pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ return pg_euc2wchar_with_len(from, to, len);
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_eucjp_mblen(const unsigned char *s)
|
|
+{
|
|
+ return pg_euc_mblen(s);
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_eucjp_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s == SS2)
|
|
+ len = 1;
|
|
+ else if (*s == SS3)
|
|
+ len = 2;
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s);
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * EUC_KR
|
|
+ */
|
|
+static int
|
|
+pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ return pg_euc2wchar_with_len(from, to, len);
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euckr_mblen(const unsigned char *s)
|
|
+{
|
|
+ return pg_euc_mblen(s);
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euckr_dsplen(const unsigned char *s)
|
|
+{
|
|
+ return pg_euc_dsplen(s);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * EUC_CN
|
|
+ *
|
|
+ */
|
|
+static int
|
|
+pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
|
|
+ {
|
|
+ from++;
|
|
+ *to = (SS2 << 16) | (*from++ << 8);
|
|
+ *to |= *from++;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
|
|
+ {
|
|
+ from++;
|
|
+ *to = (SS3 << 16) | (*from++ << 8);
|
|
+ *to |= *from++;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
|
|
+ {
|
|
+ *to = *from++ << 8;
|
|
+ *to |= *from++;
|
|
+ len -= 2;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ *to = *from++;
|
|
+ len--;
|
|
+ }
|
|
+ to++;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euccn_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = 1;
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euccn_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s);
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * EUC_TW
|
|
+ *
|
|
+ */
|
|
+static int
|
|
+pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ if (*from == SS2 && len >= 4) /* code set 2 */
|
|
+ {
|
|
+ from++;
|
|
+ *to = (((uint32) SS2) << 24) | (*from++ << 16);
|
|
+ *to |= *from++ << 8;
|
|
+ *to |= *from++;
|
|
+ len -= 4;
|
|
+ }
|
|
+ else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
|
|
+ {
|
|
+ from++;
|
|
+ *to = (SS3 << 16) | (*from++ << 8);
|
|
+ *to |= *from++;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
|
|
+ {
|
|
+ *to = *from++ << 8;
|
|
+ *to |= *from++;
|
|
+ len -= 2;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ *to = *from++;
|
|
+ len--;
|
|
+ }
|
|
+ to++;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euctw_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s == SS2)
|
|
+ len = 4;
|
|
+ else if (*s == SS3)
|
|
+ len = 3;
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = 1;
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euctw_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s == SS2)
|
|
+ len = 2;
|
|
+ else if (*s == SS3)
|
|
+ len = 2;
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s);
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Convert pg_wchar to EUC_* encoding.
|
|
+ * caller must allocate enough space for "to", including a trailing zero!
|
|
+ * len: length of from.
|
|
+ * "from" not necessarily null terminated.
|
|
+ */
|
|
+static int
|
|
+pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ unsigned char c;
|
|
+
|
|
+ if ((c = (*from >> 24)))
|
|
+ {
|
|
+ *to++ = c;
|
|
+ *to++ = (*from >> 16) & 0xff;
|
|
+ *to++ = (*from >> 8) & 0xff;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 4;
|
|
+ }
|
|
+ else if ((c = (*from >> 16)))
|
|
+ {
|
|
+ *to++ = c;
|
|
+ *to++ = (*from >> 8) & 0xff;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 3;
|
|
+ }
|
|
+ else if ((c = (*from >> 8)))
|
|
+ {
|
|
+ *to++ = c;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 2;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ *to++ = *from;
|
|
+ cnt++;
|
|
+ }
|
|
+ from++;
|
|
+ len--;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+
|
|
+/*
|
|
+ * JOHAB
|
|
+ */
|
|
+static int
|
|
+pg_johab_mblen(const unsigned char *s)
|
|
+{
|
|
+ return pg_euc_mblen(s);
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_johab_dsplen(const unsigned char *s)
|
|
+{
|
|
+ return pg_euc_dsplen(s);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * convert UTF8 string to pg_wchar (UCS-4)
|
|
+ * caller must allocate enough space for "to", including a trailing zero!
|
|
+ * len: length of from.
|
|
+ * "from" not necessarily null terminated.
|
|
+ */
|
|
+static int
|
|
+pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+ uint32 c1,
|
|
+ c2,
|
|
+ c3,
|
|
+ c4;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ if ((*from & 0x80) == 0)
|
|
+ {
|
|
+ *to = *from++;
|
|
+ len--;
|
|
+ }
|
|
+ else if ((*from & 0xe0) == 0xc0)
|
|
+ {
|
|
+ if (len < 2)
|
|
+ break; /* drop trailing incomplete char */
|
|
+ c1 = *from++ & 0x1f;
|
|
+ c2 = *from++ & 0x3f;
|
|
+ *to = (c1 << 6) | c2;
|
|
+ len -= 2;
|
|
+ }
|
|
+ else if ((*from & 0xf0) == 0xe0)
|
|
+ {
|
|
+ if (len < 3)
|
|
+ break; /* drop trailing incomplete char */
|
|
+ c1 = *from++ & 0x0f;
|
|
+ c2 = *from++ & 0x3f;
|
|
+ c3 = *from++ & 0x3f;
|
|
+ *to = (c1 << 12) | (c2 << 6) | c3;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if ((*from & 0xf8) == 0xf0)
|
|
+ {
|
|
+ if (len < 4)
|
|
+ break; /* drop trailing incomplete char */
|
|
+ c1 = *from++ & 0x07;
|
|
+ c2 = *from++ & 0x3f;
|
|
+ c3 = *from++ & 0x3f;
|
|
+ c4 = *from++ & 0x3f;
|
|
+ *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
|
|
+ len -= 4;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* treat a bogus char as length 1; not ours to raise error */
|
|
+ *to = *from++;
|
|
+ len--;
|
|
+ }
|
|
+ to++;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+
|
|
+/*
|
|
+ * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
|
|
+ * space allocated.
|
|
+ */
|
|
+unsigned char *
|
|
+unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
|
|
+{
|
|
+ if (c <= 0x7F)
|
|
+ {
|
|
+ utf8string[0] = c;
|
|
+ }
|
|
+ else if (c <= 0x7FF)
|
|
+ {
|
|
+ utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
|
|
+ utf8string[1] = 0x80 | (c & 0x3F);
|
|
+ }
|
|
+ else if (c <= 0xFFFF)
|
|
+ {
|
|
+ utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
|
|
+ utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
|
|
+ utf8string[2] = 0x80 | (c & 0x3F);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
|
|
+ utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
|
|
+ utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
|
|
+ utf8string[3] = 0x80 | (c & 0x3F);
|
|
+ }
|
|
+
|
|
+ return utf8string;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Trivial conversion from pg_wchar to UTF-8.
|
|
+ * caller should allocate enough space for "to"
|
|
+ * len: length of from.
|
|
+ * "from" not necessarily null terminated.
|
|
+ */
|
|
+static int
|
|
+pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ int char_len;
|
|
+
|
|
+ unicode_to_utf8(*from, to);
|
|
+ char_len = pg_utf_mblen(to);
|
|
+ cnt += char_len;
|
|
+ to += char_len;
|
|
+ from++;
|
|
+ len--;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the byte length of a UTF8 character pointed to by s
|
|
+ *
|
|
+ * Note: in the current implementation we do not support UTF8 sequences
|
|
+ * of more than 4 bytes; hence do NOT return a value larger than 4.
|
|
+ * We return "1" for any leading byte that is either flat-out illegal or
|
|
+ * indicates a length larger than we support.
|
|
+ *
|
|
+ * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
|
|
+ * other places would need to be fixed to change this.
|
|
+ */
|
|
+int
|
|
+pg_utf_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if ((*s & 0x80) == 0)
|
|
+ len = 1;
|
|
+ else if ((*s & 0xe0) == 0xc0)
|
|
+ len = 2;
|
|
+ else if ((*s & 0xf0) == 0xe0)
|
|
+ len = 3;
|
|
+ else if ((*s & 0xf8) == 0xf0)
|
|
+ len = 4;
|
|
+#ifdef NOT_USED
|
|
+ else if ((*s & 0xfc) == 0xf8)
|
|
+ len = 5;
|
|
+ else if ((*s & 0xfe) == 0xfc)
|
|
+ len = 6;
|
|
+#endif
|
|
+ else
|
|
+ len = 1;
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is an implementation of wcwidth() and wcswidth() as defined in
|
|
+ * "The Single UNIX Specification, Version 2, The Open Group, 1997"
|
|
+ * <http://www.unix.org/online.html>
|
|
+ *
|
|
+ * Markus Kuhn -- 2001-09-08 -- public domain
|
|
+ *
|
|
+ * customised for PostgreSQL
|
|
+ *
|
|
+ * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
|
+ */
|
|
+
|
|
+struct mbinterval
|
|
+{
|
|
+ unsigned short first;
|
|
+ unsigned short last;
|
|
+};
|
|
+
|
|
+/* auxiliary function for binary search in interval table */
|
|
+static int
|
|
+mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
|
|
+{
|
|
+ int min = 0;
|
|
+ int mid;
|
|
+
|
|
+ if (ucs < table[0].first || ucs > table[max].last)
|
|
+ return 0;
|
|
+ while (max >= min)
|
|
+ {
|
|
+ mid = (min + max) / 2;
|
|
+ if (ucs > table[mid].last)
|
|
+ min = mid + 1;
|
|
+ else if (ucs < table[mid].first)
|
|
+ max = mid - 1;
|
|
+ else
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+
|
|
+/* The following functions define the column width of an ISO 10646
|
|
+ * character as follows:
|
|
+ *
|
|
+ * - The null character (U+0000) has a column width of 0.
|
|
+ *
|
|
+ * - Other C0/C1 control characters and DEL will lead to a return
|
|
+ * value of -1.
|
|
+ *
|
|
+ * - Non-spacing and enclosing combining characters (general
|
|
+ * category code Mn or Me in the Unicode database) have a
|
|
+ * column width of 0.
|
|
+ *
|
|
+ * - Other format characters (general category code Cf in the Unicode
|
|
+ * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
|
|
+ *
|
|
+ * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
|
|
+ * have a column width of 0.
|
|
+ *
|
|
+ * - Spacing characters in the East Asian Wide (W) or East Asian
|
|
+ * FullWidth (F) category as defined in Unicode Technical
|
|
+ * Report #11 have a column width of 2.
|
|
+ *
|
|
+ * - All remaining characters (including all printable
|
|
+ * ISO 8859-1 and WGL4 characters, Unicode control characters,
|
|
+ * etc.) have a column width of 1.
|
|
+ *
|
|
+ * This implementation assumes that wchar_t characters are encoded
|
|
+ * in ISO 10646.
|
|
+ */
|
|
+
|
|
+static int
|
|
+ucs_wcwidth(pg_wchar ucs)
|
|
+{
|
|
+#include "common/unicode_combining_table.h"
|
|
+
|
|
+ /* test for 8-bit control characters */
|
|
+ if (ucs == 0)
|
|
+ return 0;
|
|
+
|
|
+ if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
|
|
+ return -1;
|
|
+
|
|
+ /* binary search in table of non-spacing characters */
|
|
+ if (mbbisearch(ucs, combining,
|
|
+ sizeof(combining) / sizeof(struct mbinterval) - 1))
|
|
+ return 0;
|
|
+
|
|
+ /*
|
|
+ * if we arrive here, ucs is not a combining or C0/C1 control character
|
|
+ */
|
|
+
|
|
+ return 1 +
|
|
+ (ucs >= 0x1100 &&
|
|
+ (ucs <= 0x115f || /* Hangul Jamo init. consonants */
|
|
+ (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
|
|
+ ucs != 0x303f) || /* CJK ... Yi */
|
|
+ (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
|
|
+ (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
|
|
+ * Ideographs */
|
|
+ (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
|
|
+ (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
|
|
+ (ucs >= 0xffe0 && ucs <= 0xffe6) ||
|
|
+ (ucs >= 0x20000 && ucs <= 0x2ffff)));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Convert a UTF-8 character to a Unicode code point.
|
|
+ * This is a one-character version of pg_utf2wchar_with_len.
|
|
+ *
|
|
+ * No error checks here, c must point to a long-enough string.
|
|
+ */
|
|
+pg_wchar
|
|
+utf8_to_unicode(const unsigned char *c)
|
|
+{
|
|
+ if ((*c & 0x80) == 0)
|
|
+ return (pg_wchar) c[0];
|
|
+ else if ((*c & 0xe0) == 0xc0)
|
|
+ return (pg_wchar) (((c[0] & 0x1f) << 6) |
|
|
+ (c[1] & 0x3f));
|
|
+ else if ((*c & 0xf0) == 0xe0)
|
|
+ return (pg_wchar) (((c[0] & 0x0f) << 12) |
|
|
+ ((c[1] & 0x3f) << 6) |
|
|
+ (c[2] & 0x3f));
|
|
+ else if ((*c & 0xf8) == 0xf0)
|
|
+ return (pg_wchar) (((c[0] & 0x07) << 18) |
|
|
+ ((c[1] & 0x3f) << 12) |
|
|
+ ((c[2] & 0x3f) << 6) |
|
|
+ (c[3] & 0x3f));
|
|
+ else
|
|
+ /* that is an invalid code on purpose */
|
|
+ return 0xffffffff;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_utf_dsplen(const unsigned char *s)
|
|
+{
|
|
+ return ucs_wcwidth(utf8_to_unicode(s));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * convert mule internal code to pg_wchar
|
|
+ * caller should allocate enough space for "to"
|
|
+ * len: length of from.
|
|
+ * "from" not necessarily null terminated.
|
|
+ */
|
|
+static int
|
|
+pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ if (IS_LC1(*from) && len >= 2)
|
|
+ {
|
|
+ *to = *from++ << 16;
|
|
+ *to |= *from++;
|
|
+ len -= 2;
|
|
+ }
|
|
+ else if (IS_LCPRV1(*from) && len >= 3)
|
|
+ {
|
|
+ from++;
|
|
+ *to = *from++ << 16;
|
|
+ *to |= *from++;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if (IS_LC2(*from) && len >= 3)
|
|
+ {
|
|
+ *to = *from++ << 16;
|
|
+ *to |= *from++ << 8;
|
|
+ *to |= *from++;
|
|
+ len -= 3;
|
|
+ }
|
|
+ else if (IS_LCPRV2(*from) && len >= 4)
|
|
+ {
|
|
+ from++;
|
|
+ *to = *from++ << 16;
|
|
+ *to |= *from++ << 8;
|
|
+ *to |= *from++;
|
|
+ len -= 4;
|
|
+ }
|
|
+ else
|
|
+ { /* assume ASCII */
|
|
+ *to = (unsigned char) *from++;
|
|
+ len--;
|
|
+ }
|
|
+ to++;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * convert pg_wchar to mule internal code
|
|
+ * caller should allocate enough space for "to"
|
|
+ * len: length of from.
|
|
+ * "from" not necessarily null terminated.
|
|
+ */
|
|
+static int
|
|
+pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ unsigned char lb;
|
|
+
|
|
+ lb = (*from >> 16) & 0xff;
|
|
+ if (IS_LC1(lb))
|
|
+ {
|
|
+ *to++ = lb;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 2;
|
|
+ }
|
|
+ else if (IS_LC2(lb))
|
|
+ {
|
|
+ *to++ = lb;
|
|
+ *to++ = (*from >> 8) & 0xff;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 3;
|
|
+ }
|
|
+ else if (IS_LCPRV1_A_RANGE(lb))
|
|
+ {
|
|
+ *to++ = LCPRV1_A;
|
|
+ *to++ = lb;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 3;
|
|
+ }
|
|
+ else if (IS_LCPRV1_B_RANGE(lb))
|
|
+ {
|
|
+ *to++ = LCPRV1_B;
|
|
+ *to++ = lb;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 3;
|
|
+ }
|
|
+ else if (IS_LCPRV2_A_RANGE(lb))
|
|
+ {
|
|
+ *to++ = LCPRV2_A;
|
|
+ *to++ = lb;
|
|
+ *to++ = (*from >> 8) & 0xff;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 4;
|
|
+ }
|
|
+ else if (IS_LCPRV2_B_RANGE(lb))
|
|
+ {
|
|
+ *to++ = LCPRV2_B;
|
|
+ *to++ = lb;
|
|
+ *to++ = (*from >> 8) & 0xff;
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 4;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ *to++ = *from & 0xff;
|
|
+ cnt += 1;
|
|
+ }
|
|
+ from++;
|
|
+ len--;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+/* exported for direct use by conv.c */
|
|
+int
|
|
+pg_mule_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_LC1(*s))
|
|
+ len = 2;
|
|
+ else if (IS_LCPRV1(*s))
|
|
+ len = 3;
|
|
+ else if (IS_LC2(*s))
|
|
+ len = 3;
|
|
+ else if (IS_LCPRV2(*s))
|
|
+ len = 4;
|
|
+ else
|
|
+ len = 1; /* assume ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_mule_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ /*
|
|
+ * Note: it's not really appropriate to assume that all multibyte charsets
|
|
+ * are double-wide on screen. But this seems an okay approximation for
|
|
+ * the MULE charsets we currently support.
|
|
+ */
|
|
+
|
|
+ if (IS_LC1(*s))
|
|
+ len = 1;
|
|
+ else if (IS_LCPRV1(*s))
|
|
+ len = 1;
|
|
+ else if (IS_LC2(*s))
|
|
+ len = 2;
|
|
+ else if (IS_LCPRV2(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = 1; /* assume ASCII */
|
|
+
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * ISO8859-1
|
|
+ */
|
|
+static int
|
|
+pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ *to++ = *from++;
|
|
+ len--;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Trivial conversion from pg_wchar to single byte encoding. Just ignores
|
|
+ * high bits.
|
|
+ * caller should allocate enough space for "to"
|
|
+ * len: length of from.
|
|
+ * "from" not necessarily null terminated.
|
|
+ */
|
|
+static int
|
|
+pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
|
|
+{
|
|
+ int cnt = 0;
|
|
+
|
|
+ while (len > 0 && *from)
|
|
+ {
|
|
+ *to++ = *from++;
|
|
+ len--;
|
|
+ cnt++;
|
|
+ }
|
|
+ *to = 0;
|
|
+ return cnt;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_latin1_mblen(const unsigned char *s)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_latin1_dsplen(const unsigned char *s)
|
|
+{
|
|
+ return pg_ascii_dsplen(s);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * SJIS
|
|
+ */
|
|
+static int
|
|
+pg_sjis_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s >= 0xa1 && *s <= 0xdf)
|
|
+ len = 1; /* 1 byte kana? */
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* kanji? */
|
|
+ else
|
|
+ len = 1; /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_sjis_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (*s >= 0xa1 && *s <= 0xdf)
|
|
+ len = 1; /* 1 byte kana? */
|
|
+ else if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* kanji? */
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s); /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Big5
|
|
+ */
|
|
+static int
|
|
+pg_big5_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* kanji? */
|
|
+ else
|
|
+ len = 1; /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_big5_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* kanji? */
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s); /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * GBK
|
|
+ */
|
|
+static int
|
|
+pg_gbk_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* kanji? */
|
|
+ else
|
|
+ len = 1; /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_gbk_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* kanji? */
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s); /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * UHC
|
|
+ */
|
|
+static int
|
|
+pg_uhc_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* 2byte? */
|
|
+ else
|
|
+ len = 1; /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_uhc_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2; /* 2byte? */
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s); /* should be ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * GB18030
|
|
+ * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Unlike all other mblen() functions, this also looks at the second byte of
|
|
+ * the input. However, if you only pass the first byte of a multi-byte
|
|
+ * string, and \0 as the second byte, this still works in a predictable way:
|
|
+ * a 4-byte character will be reported as two 2-byte characters. That's
|
|
+ * enough for all current uses, as a client-only encoding. It works that
|
|
+ * way, because in any valid 4-byte GB18030-encoded character, the third and
|
|
+ * fourth byte look like a 2-byte encoded character, when looked at
|
|
+ * separately.
|
|
+ */
|
|
+static int
|
|
+pg_gb18030_mblen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (!IS_HIGHBIT_SET(*s))
|
|
+ len = 1; /* ASCII */
|
|
+ else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
|
|
+ len = 4;
|
|
+ else
|
|
+ len = 2;
|
|
+ return len;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_gb18030_dsplen(const unsigned char *s)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(*s))
|
|
+ len = 2;
|
|
+ else
|
|
+ len = pg_ascii_dsplen(s); /* ASCII */
|
|
+ return len;
|
|
+}
|
|
+
|
|
+/*
|
|
+ *-------------------------------------------------------------------
|
|
+ * multibyte sequence validators
|
|
+ *
|
|
+ * These functions accept "s", a pointer to the first byte of a string,
|
|
+ * and "len", the remaining length of the string. If there is a validly
|
|
+ * encoded character beginning at *s, return its length in bytes; else
|
|
+ * return -1.
|
|
+ *
|
|
+ * The functions can assume that len > 0 and that *s != '\0', but they must
|
|
+ * test for and reject zeroes in any additional bytes of a multibyte character.
|
|
+ *
|
|
+ * Note that this definition allows the function for a single-byte
|
|
+ * encoding to be just "return 1".
|
|
+ *-------------------------------------------------------------------
|
|
+ */
|
|
+
|
|
+static int
|
|
+pg_ascii_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
|
|
+
|
|
+static int
|
|
+pg_eucjp_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l;
|
|
+ unsigned char c1,
|
|
+ c2;
|
|
+
|
|
+ c1 = *s++;
|
|
+
|
|
+ switch (c1)
|
|
+ {
|
|
+ case SS2: /* JIS X 0201 */
|
|
+ l = 2;
|
|
+ if (l > len)
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (c2 < 0xa1 || c2 > 0xdf)
|
|
+ return -1;
|
|
+ break;
|
|
+
|
|
+ case SS3: /* JIS X 0212 */
|
|
+ l = 3;
|
|
+ if (l > len)
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ break;
|
|
+
|
|
+ default:
|
|
+ if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
|
|
+ {
|
|
+ l = 2;
|
|
+ if (l > len)
|
|
+ return -1;
|
|
+ if (!IS_EUC_RANGE_VALID(c1))
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ }
|
|
+ else
|
|
+ /* must be ASCII */
|
|
+ {
|
|
+ l = 1;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return l;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_euckr_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l;
|
|
+ unsigned char c1,
|
|
+ c2;
|
|
+
|
|
+ c1 = *s++;
|
|
+
|
|
+ if (IS_HIGHBIT_SET(c1))
|
|
+ {
|
|
+ l = 2;
|
|
+ if (l > len)
|
|
+ return -1;
|
|
+ if (!IS_EUC_RANGE_VALID(c1))
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ }
|
|
+ else
|
|
+ /* must be ASCII */
|
|
+ {
|
|
+ l = 1;
|
|
+ }
|
|
+
|
|
+ return l;
|
|
+}
|
|
+
|
|
+/* EUC-CN byte sequences are exactly same as EUC-KR */
|
|
+#define pg_euccn_verifier pg_euckr_verifier
|
|
+
|
|
+static int
|
|
+pg_euctw_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l;
|
|
+ unsigned char c1,
|
|
+ c2;
|
|
+
|
|
+ c1 = *s++;
|
|
+
|
|
+ switch (c1)
|
|
+ {
|
|
+ case SS2: /* CNS 11643 Plane 1-7 */
|
|
+ l = 4;
|
|
+ if (l > len)
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (c2 < 0xa1 || c2 > 0xa7)
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ break;
|
|
+
|
|
+ case SS3: /* unused */
|
|
+ return -1;
|
|
+
|
|
+ default:
|
|
+ if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
|
|
+ {
|
|
+ l = 2;
|
|
+ if (l > len)
|
|
+ return -1;
|
|
+ /* no further range check on c1? */
|
|
+ c2 = *s++;
|
|
+ if (!IS_EUC_RANGE_VALID(c2))
|
|
+ return -1;
|
|
+ }
|
|
+ else
|
|
+ /* must be ASCII */
|
|
+ {
|
|
+ l = 1;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ return l;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_johab_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l,
|
|
+ mbl;
|
|
+ unsigned char c;
|
|
+
|
|
+ l = mbl = pg_johab_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ if (!IS_HIGHBIT_SET(*s))
|
|
+ return mbl;
|
|
+
|
|
+ while (--l > 0)
|
|
+ {
|
|
+ c = *++s;
|
|
+ if (!IS_EUC_RANGE_VALID(c))
|
|
+ return -1;
|
|
+ }
|
|
+ return mbl;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_mule_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l,
|
|
+ mbl;
|
|
+ unsigned char c;
|
|
+
|
|
+ l = mbl = pg_mule_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ while (--l > 0)
|
|
+ {
|
|
+ c = *++s;
|
|
+ if (!IS_HIGHBIT_SET(c))
|
|
+ return -1;
|
|
+ }
|
|
+ return mbl;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_latin1_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_sjis_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l,
|
|
+ mbl;
|
|
+ unsigned char c1,
|
|
+ c2;
|
|
+
|
|
+ l = mbl = pg_sjis_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ if (l == 1) /* pg_sjis_mblen already verified it */
|
|
+ return mbl;
|
|
+
|
|
+ c1 = *s++;
|
|
+ c2 = *s;
|
|
+ if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
|
|
+ return -1;
|
|
+ return mbl;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_big5_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l,
|
|
+ mbl;
|
|
+
|
|
+ l = mbl = pg_big5_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ if (l == 2 &&
|
|
+ s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
+ s[1] == NONUTF8_INVALID_BYTE1)
|
|
+ return -1;
|
|
+
|
|
+ while (--l > 0)
|
|
+ {
|
|
+ if (*++s == '\0')
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return mbl;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_gbk_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l,
|
|
+ mbl;
|
|
+
|
|
+ l = mbl = pg_gbk_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ if (l == 2 &&
|
|
+ s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
+ s[1] == NONUTF8_INVALID_BYTE1)
|
|
+ return -1;
|
|
+
|
|
+ while (--l > 0)
|
|
+ {
|
|
+ if (*++s == '\0')
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return mbl;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_uhc_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l,
|
|
+ mbl;
|
|
+
|
|
+ l = mbl = pg_uhc_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ if (l == 2 &&
|
|
+ s[0] == NONUTF8_INVALID_BYTE0 &&
|
|
+ s[1] == NONUTF8_INVALID_BYTE1)
|
|
+ return -1;
|
|
+
|
|
+ while (--l > 0)
|
|
+ {
|
|
+ if (*++s == '\0')
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ return mbl;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_gb18030_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l;
|
|
+
|
|
+ if (!IS_HIGHBIT_SET(*s))
|
|
+ l = 1; /* ASCII */
|
|
+ else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
|
|
+ {
|
|
+ /* Should be 4-byte, validate remaining bytes */
|
|
+ if (*s >= 0x81 && *s <= 0xfe &&
|
|
+ *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
|
|
+ *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
|
|
+ l = 4;
|
|
+ else
|
|
+ l = -1;
|
|
+ }
|
|
+ else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
|
|
+ {
|
|
+ /* Should be 2-byte, validate */
|
|
+ if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
|
|
+ (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
|
|
+ l = 2;
|
|
+ else
|
|
+ l = -1;
|
|
+ }
|
|
+ else
|
|
+ l = -1;
|
|
+ return l;
|
|
+}
|
|
+
|
|
+static int
|
|
+pg_utf8_verifier(const unsigned char *s, int len)
|
|
+{
|
|
+ int l = pg_utf_mblen(s);
|
|
+
|
|
+ if (len < l)
|
|
+ return -1;
|
|
+
|
|
+ if (!pg_utf8_islegal(s, l))
|
|
+ return -1;
|
|
+
|
|
+ return l;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check for validity of a single UTF-8 encoded character
|
|
+ *
|
|
+ * This directly implements the rules in RFC3629. The bizarre-looking
|
|
+ * restrictions on the second byte are meant to ensure that there isn't
|
|
+ * more than one encoding of a given Unicode character point; that is,
|
|
+ * you may not use a longer-than-necessary byte sequence with high order
|
|
+ * zero bits to represent a character that would fit in fewer bytes.
|
|
+ * To do otherwise is to create security hazards (eg, create an apparent
|
|
+ * non-ASCII character that decodes to plain ASCII).
|
|
+ *
|
|
+ * length is assumed to have been obtained by pg_utf_mblen(), and the
|
|
+ * caller must have checked that that many bytes are present in the buffer.
|
|
+ */
|
|
+bool
|
|
+pg_utf8_islegal(const unsigned char *source, int length)
|
|
+{
|
|
+ unsigned char a;
|
|
+
|
|
+ switch (length)
|
|
+ {
|
|
+ default:
|
|
+ /* reject lengths 5 and 6 for now */
|
|
+ return false;
|
|
+ case 4:
|
|
+ a = source[3];
|
|
+ if (a < 0x80 || a > 0xBF)
|
|
+ return false;
|
|
+ /* FALL THRU */
|
|
+ case 3:
|
|
+ a = source[2];
|
|
+ if (a < 0x80 || a > 0xBF)
|
|
+ return false;
|
|
+ /* FALL THRU */
|
|
+ case 2:
|
|
+ a = source[1];
|
|
+ switch (*source)
|
|
+ {
|
|
+ case 0xE0:
|
|
+ if (a < 0xA0 || a > 0xBF)
|
|
+ return false;
|
|
+ break;
|
|
+ case 0xED:
|
|
+ if (a < 0x80 || a > 0x9F)
|
|
+ return false;
|
|
+ break;
|
|
+ case 0xF0:
|
|
+ if (a < 0x90 || a > 0xBF)
|
|
+ return false;
|
|
+ break;
|
|
+ case 0xF4:
|
|
+ if (a < 0x80 || a > 0x8F)
|
|
+ return false;
|
|
+ break;
|
|
+ default:
|
|
+ if (a < 0x80 || a > 0xBF)
|
|
+ return false;
|
|
+ break;
|
|
+ }
|
|
+ /* FALL THRU */
|
|
+ case 1:
|
|
+ a = *source;
|
|
+ if (a >= 0x80 && a < 0xC2)
|
|
+ return false;
|
|
+ if (a > 0xF4)
|
|
+ return false;
|
|
+ break;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+
|
|
+/*
|
|
+ * Fills the provided buffer with two bytes such that:
|
|
+ * pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
|
|
+ */
|
|
+void
|
|
+pg_encoding_set_invalid(int encoding, char *dst)
|
|
+{
|
|
+ Assert(pg_encoding_max_length(encoding) > 1);
|
|
+
|
|
+ dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
|
|
+ dst[1] = NONUTF8_INVALID_BYTE1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ *-------------------------------------------------------------------
|
|
+ * encoding info table
|
|
+ * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
|
|
+ *-------------------------------------------------------------------
|
|
+ */
|
|
+const pg_wchar_tbl pg_wchar_table[] = {
|
|
+ {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
|
|
+ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
|
|
+ {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
|
|
+ {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */
|
|
+ {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */
|
|
+ {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */
|
|
+ {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
|
|
+ {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
|
|
+ {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
|
|
+ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
|
|
+ {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
|
|
+ {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
|
|
+ {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
|
|
+ {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
|
|
+ {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
|
|
+ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Returns the byte length of a multibyte character.
|
|
+ *
|
|
+ * Caution: when dealing with text that is not certainly valid in the
|
|
+ * specified encoding, the result may exceed the actual remaining
|
|
+ * string length. Callers that are not prepared to deal with that
|
|
+ * should use pg_encoding_mblen_bounded() instead.
|
|
+ */
|
|
+int
|
|
+pg_encoding_mblen(int encoding, const char *mbstr)
|
|
+{
|
|
+ return (PG_VALID_ENCODING(encoding) ?
|
|
+ pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
|
|
+ pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns the byte length of a multibyte character; but not more than
|
|
+ * the distance to end of string.
|
|
+ */
|
|
+int
|
|
+pg_encoding_mblen_bounded(int encoding, const char *mbstr)
|
|
+{
|
|
+ return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns the display length of a multibyte character.
|
|
+ */
|
|
+int
|
|
+pg_encoding_dsplen(int encoding, const char *mbstr)
|
|
+{
|
|
+ return (PG_VALID_ENCODING(encoding) ?
|
|
+ pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
|
|
+ pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Verify the first multibyte character of the given string.
|
|
+ * Return its byte length if good, -1 if bad. (See comments above for
|
|
+ * full details of the mbverify API.)
|
|
+ */
|
|
+int
|
|
+pg_encoding_verifymb(int encoding, const char *mbstr, int len)
|
|
+{
|
|
+ return (PG_VALID_ENCODING(encoding) ?
|
|
+ pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
|
|
+ pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
|
|
+}
|
|
+
|
|
+/* v14+ function name, for easier backpatching */
|
|
+int
|
|
+pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
|
|
+{
|
|
+ int ok_bytes = pg_encoding_verifymb(encoding, mbstr, len);
|
|
+
|
|
+ if (ok_bytes == 0)
|
|
+ return -1;
|
|
+ return ok_bytes;
|
|
+}
|
|
+
|
|
+/* replace v14+ function, adapted from pg_verify_mbstr_len */
|
|
+int
|
|
+pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
|
|
+{
|
|
+ mbverifier mbverify;
|
|
+ int ok_bytes;
|
|
+
|
|
+ Assert(PG_VALID_ENCODING(encoding));
|
|
+
|
|
+ /*
|
|
+ * In single-byte encodings, we need only reject nulls (\0).
|
|
+ */
|
|
+ if (pg_encoding_max_length(encoding) <= 1)
|
|
+ {
|
|
+ const char *nullpos = memchr(mbstr, 0, len);
|
|
+
|
|
+ if (nullpos == NULL)
|
|
+ return len;
|
|
+ return nullpos - mbstr;
|
|
+ }
|
|
+
|
|
+ /* fetch function pointer just once */
|
|
+ mbverify = pg_wchar_table[encoding].mbverify;
|
|
+
|
|
+ ok_bytes = 0;
|
|
+
|
|
+ while (len > 0)
|
|
+ {
|
|
+ int l;
|
|
+
|
|
+ /* fast path for ASCII-subset characters */
|
|
+ if (!IS_HIGHBIT_SET(*mbstr))
|
|
+ {
|
|
+ if (*mbstr != '\0')
|
|
+ {
|
|
+ ok_bytes++;
|
|
+ mbstr++;
|
|
+ len--;
|
|
+ continue;
|
|
+ }
|
|
+ return ok_bytes;
|
|
+ }
|
|
+
|
|
+ l = (*mbverify) ((const unsigned char *) mbstr, len);
|
|
+
|
|
+ if (l < 0)
|
|
+ return ok_bytes;
|
|
+
|
|
+ mbstr += l;
|
|
+ len -= l;
|
|
+ ok_bytes += l;
|
|
+ }
|
|
+ return ok_bytes;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * fetch maximum length of a given encoding
|
|
+ */
|
|
+int
|
|
+pg_encoding_max_length(int encoding)
|
|
+{
|
|
+ Assert(PG_VALID_ENCODING(encoding));
|
|
+
|
|
+ /*
|
|
+ * Check for the encoding despite the assert, due to some mingw versions
|
|
+ * otherwise issuing bogus warnings.
|
|
+ */
|
|
+ return PG_VALID_ENCODING(encoding) ?
|
|
+ pg_wchar_table[encoding].maxmblen :
|
|
+ pg_wchar_table[PG_SQL_ASCII].maxmblen;
|
|
+}
|
|
diff --git a/src/include/common/unicode_combining_table.h b/src/include/common/unicode_combining_table.h
|
|
new file mode 100644
|
|
index 00000000000..a9f10c31bc8
|
|
--- /dev/null
|
|
+++ b/src/include/common/unicode_combining_table.h
|
|
@@ -0,0 +1,196 @@
|
|
+/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */
|
|
+
|
|
+static const struct mbinterval combining[] = {
|
|
+ {0x0300, 0x036F},
|
|
+ {0x0483, 0x0489},
|
|
+ {0x0591, 0x05BD},
|
|
+ {0x05BF, 0x05BF},
|
|
+ {0x05C1, 0x05C2},
|
|
+ {0x05C4, 0x05C5},
|
|
+ {0x05C7, 0x05C7},
|
|
+ {0x0610, 0x061A},
|
|
+ {0x064B, 0x065F},
|
|
+ {0x0670, 0x0670},
|
|
+ {0x06D6, 0x06DC},
|
|
+ {0x06DF, 0x06E4},
|
|
+ {0x06E7, 0x06E8},
|
|
+ {0x06EA, 0x06ED},
|
|
+ {0x0711, 0x0711},
|
|
+ {0x0730, 0x074A},
|
|
+ {0x07A6, 0x07B0},
|
|
+ {0x07EB, 0x07F3},
|
|
+ {0x07FD, 0x07FD},
|
|
+ {0x0816, 0x0819},
|
|
+ {0x081B, 0x0823},
|
|
+ {0x0825, 0x0827},
|
|
+ {0x0829, 0x082D},
|
|
+ {0x0859, 0x085B},
|
|
+ {0x08D3, 0x08E1},
|
|
+ {0x08E3, 0x0902},
|
|
+ {0x093A, 0x093A},
|
|
+ {0x093C, 0x093C},
|
|
+ {0x0941, 0x0948},
|
|
+ {0x094D, 0x094D},
|
|
+ {0x0951, 0x0957},
|
|
+ {0x0962, 0x0963},
|
|
+ {0x0981, 0x0981},
|
|
+ {0x09BC, 0x09BC},
|
|
+ {0x09C1, 0x09C4},
|
|
+ {0x09CD, 0x09CD},
|
|
+ {0x09E2, 0x09E3},
|
|
+ {0x09FE, 0x0A02},
|
|
+ {0x0A3C, 0x0A3C},
|
|
+ {0x0A41, 0x0A51},
|
|
+ {0x0A70, 0x0A71},
|
|
+ {0x0A75, 0x0A75},
|
|
+ {0x0A81, 0x0A82},
|
|
+ {0x0ABC, 0x0ABC},
|
|
+ {0x0AC1, 0x0AC8},
|
|
+ {0x0ACD, 0x0ACD},
|
|
+ {0x0AE2, 0x0AE3},
|
|
+ {0x0AFA, 0x0B01},
|
|
+ {0x0B3C, 0x0B3C},
|
|
+ {0x0B3F, 0x0B3F},
|
|
+ {0x0B41, 0x0B44},
|
|
+ {0x0B4D, 0x0B56},
|
|
+ {0x0B62, 0x0B63},
|
|
+ {0x0B82, 0x0B82},
|
|
+ {0x0BC0, 0x0BC0},
|
|
+ {0x0BCD, 0x0BCD},
|
|
+ {0x0C00, 0x0C00},
|
|
+ {0x0C04, 0x0C04},
|
|
+ {0x0C3E, 0x0C40},
|
|
+ {0x0C46, 0x0C56},
|
|
+ {0x0C62, 0x0C63},
|
|
+ {0x0C81, 0x0C81},
|
|
+ {0x0CBC, 0x0CBC},
|
|
+ {0x0CBF, 0x0CBF},
|
|
+ {0x0CC6, 0x0CC6},
|
|
+ {0x0CCC, 0x0CCD},
|
|
+ {0x0CE2, 0x0CE3},
|
|
+ {0x0D00, 0x0D01},
|
|
+ {0x0D3B, 0x0D3C},
|
|
+ {0x0D41, 0x0D44},
|
|
+ {0x0D4D, 0x0D4D},
|
|
+ {0x0D62, 0x0D63},
|
|
+ {0x0D81, 0x0D81},
|
|
+ {0x0DCA, 0x0DCA},
|
|
+ {0x0DD2, 0x0DD6},
|
|
+ {0x0E31, 0x0E31},
|
|
+ {0x0E34, 0x0E3A},
|
|
+ {0x0E47, 0x0E4E},
|
|
+ {0x0EB1, 0x0EB1},
|
|
+ {0x0EB4, 0x0EBC},
|
|
+ {0x0EC8, 0x0ECD},
|
|
+ {0x0F18, 0x0F19},
|
|
+ {0x0F35, 0x0F35},
|
|
+ {0x0F37, 0x0F37},
|
|
+ {0x0F39, 0x0F39},
|
|
+ {0x0F71, 0x0F7E},
|
|
+ {0x0F80, 0x0F84},
|
|
+ {0x0F86, 0x0F87},
|
|
+ {0x0F8D, 0x0FBC},
|
|
+ {0x0FC6, 0x0FC6},
|
|
+ {0x102D, 0x1030},
|
|
+ {0x1032, 0x1037},
|
|
+ {0x1039, 0x103A},
|
|
+ {0x103D, 0x103E},
|
|
+ {0x1058, 0x1059},
|
|
+ {0x105E, 0x1060},
|
|
+ {0x1071, 0x1074},
|
|
+ {0x1082, 0x1082},
|
|
+ {0x1085, 0x1086},
|
|
+ {0x108D, 0x108D},
|
|
+ {0x109D, 0x109D},
|
|
+ {0x135D, 0x135F},
|
|
+ {0x1712, 0x1714},
|
|
+ {0x1732, 0x1734},
|
|
+ {0x1752, 0x1753},
|
|
+ {0x1772, 0x1773},
|
|
+ {0x17B4, 0x17B5},
|
|
+ {0x17B7, 0x17BD},
|
|
+ {0x17C6, 0x17C6},
|
|
+ {0x17C9, 0x17D3},
|
|
+ {0x17DD, 0x17DD},
|
|
+ {0x180B, 0x180D},
|
|
+ {0x1885, 0x1886},
|
|
+ {0x18A9, 0x18A9},
|
|
+ {0x1920, 0x1922},
|
|
+ {0x1927, 0x1928},
|
|
+ {0x1932, 0x1932},
|
|
+ {0x1939, 0x193B},
|
|
+ {0x1A17, 0x1A18},
|
|
+ {0x1A1B, 0x1A1B},
|
|
+ {0x1A56, 0x1A56},
|
|
+ {0x1A58, 0x1A60},
|
|
+ {0x1A62, 0x1A62},
|
|
+ {0x1A65, 0x1A6C},
|
|
+ {0x1A73, 0x1A7F},
|
|
+ {0x1AB0, 0x1B03},
|
|
+ {0x1B34, 0x1B34},
|
|
+ {0x1B36, 0x1B3A},
|
|
+ {0x1B3C, 0x1B3C},
|
|
+ {0x1B42, 0x1B42},
|
|
+ {0x1B6B, 0x1B73},
|
|
+ {0x1B80, 0x1B81},
|
|
+ {0x1BA2, 0x1BA5},
|
|
+ {0x1BA8, 0x1BA9},
|
|
+ {0x1BAB, 0x1BAD},
|
|
+ {0x1BE6, 0x1BE6},
|
|
+ {0x1BE8, 0x1BE9},
|
|
+ {0x1BED, 0x1BED},
|
|
+ {0x1BEF, 0x1BF1},
|
|
+ {0x1C2C, 0x1C33},
|
|
+ {0x1C36, 0x1C37},
|
|
+ {0x1CD0, 0x1CD2},
|
|
+ {0x1CD4, 0x1CE0},
|
|
+ {0x1CE2, 0x1CE8},
|
|
+ {0x1CED, 0x1CED},
|
|
+ {0x1CF4, 0x1CF4},
|
|
+ {0x1CF8, 0x1CF9},
|
|
+ {0x1DC0, 0x1DFF},
|
|
+ {0x20D0, 0x20F0},
|
|
+ {0x2CEF, 0x2CF1},
|
|
+ {0x2D7F, 0x2D7F},
|
|
+ {0x2DE0, 0x2DFF},
|
|
+ {0x302A, 0x302D},
|
|
+ {0x3099, 0x309A},
|
|
+ {0xA66F, 0xA672},
|
|
+ {0xA674, 0xA67D},
|
|
+ {0xA69E, 0xA69F},
|
|
+ {0xA6F0, 0xA6F1},
|
|
+ {0xA802, 0xA802},
|
|
+ {0xA806, 0xA806},
|
|
+ {0xA80B, 0xA80B},
|
|
+ {0xA825, 0xA826},
|
|
+ {0xA82C, 0xA82C},
|
|
+ {0xA8C4, 0xA8C5},
|
|
+ {0xA8E0, 0xA8F1},
|
|
+ {0xA8FF, 0xA8FF},
|
|
+ {0xA926, 0xA92D},
|
|
+ {0xA947, 0xA951},
|
|
+ {0xA980, 0xA982},
|
|
+ {0xA9B3, 0xA9B3},
|
|
+ {0xA9B6, 0xA9B9},
|
|
+ {0xA9BC, 0xA9BD},
|
|
+ {0xA9E5, 0xA9E5},
|
|
+ {0xAA29, 0xAA2E},
|
|
+ {0xAA31, 0xAA32},
|
|
+ {0xAA35, 0xAA36},
|
|
+ {0xAA43, 0xAA43},
|
|
+ {0xAA4C, 0xAA4C},
|
|
+ {0xAA7C, 0xAA7C},
|
|
+ {0xAAB0, 0xAAB0},
|
|
+ {0xAAB2, 0xAAB4},
|
|
+ {0xAAB7, 0xAAB8},
|
|
+ {0xAABE, 0xAABF},
|
|
+ {0xAAC1, 0xAAC1},
|
|
+ {0xAAEC, 0xAAED},
|
|
+ {0xAAF6, 0xAAF6},
|
|
+ {0xABE5, 0xABE5},
|
|
+ {0xABE8, 0xABE8},
|
|
+ {0xABED, 0xABED},
|
|
+ {0xFB1E, 0xFB1E},
|
|
+ {0xFE00, 0xFE0F},
|
|
+ {0xFE20, 0xFE2F},
|
|
+};
|
|
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
|
|
index 07b316fae1d..2bbdf2e792f 100644
|
|
--- a/src/include/mb/pg_wchar.h
|
|
+++ b/src/include/mb/pg_wchar.h
|
|
@@ -521,6 +521,10 @@ extern int pg_valid_server_encoding_id(int encoding);
|
|
* of them do exist inside libpq.
|
|
*/
|
|
extern void pg_encoding_set_invalid(int encoding, char *dst);
|
|
+extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
|
|
+extern int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len);
|
|
+extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
|
|
+
|
|
extern int pg_mb2wchar(const char *from, pg_wchar *to);
|
|
extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
|
|
extern int pg_encoding_mb2wchar_with_len(int encoding,
|
|
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
|
|
index a29d19a6268..dccee6a5597 100644
|
|
--- a/src/interfaces/libpq/fe-exec.c
|
|
+++ b/src/interfaces/libpq/fe-exec.c
|
|
@@ -132,6 +132,8 @@ static int check_field_number(const PGresult *res, int field_num);
|
|
#define PGRESULT_SEP_ALLOC_THRESHOLD (PGRESULT_DATA_BLOCKSIZE / 2)
|
|
|
|
|
|
+
|
|
+
|
|
/*
|
|
* PQmakeEmptyPGresult
|
|
* returns a newly allocated, initialized PGresult with given status.
|
|
@@ -3403,9 +3405,10 @@ PQescapeStringInternal(PGconn *conn,
|
|
if (error)
|
|
*error = 1;
|
|
if (conn)
|
|
- libpq_append_conn_error(conn, "incomplete multibyte character");
|
|
+ printfPQExpBuffer(&conn->errorMessage,
|
|
+ libpq_gettext("incomplete multibyte character\n"));
|
|
|
|
- pg_encoding_set_invalid(encoding, target);
|
|
+ pg_encoding_set_invalid(encoding, target);
|
|
target += 2;
|
|
source++;
|
|
remaining--;
|
|
--
|
|
2.39.5 (Apple Git-154)
|
|
|
|
|
|
From 27827fe62777a809cc3f5a54742839bc031b02f6 Mon Sep 17 00:00:00 2001
|
|
From: Filip Janus <fjanus@redhat.com>
|
|
Date: Tue, 18 Mar 2025 10:11:09 +0100
|
|
Subject: [PATCH 8/8] Fix failing dropdb.c
|
|
|
|
---
|
|
src/bin/scripts/dropdb.c | 4 ----
|
|
 1 file changed, 4 deletions(-)
|
|
|
|
diff --git a/src/bin/scripts/dropdb.c b/src/bin/scripts/dropdb.c
|
|
index ed3a2c8c19a..140982717d9 100644
|
|
--- a/src/bin/scripts/dropdb.c
|
|
+++ b/src/bin/scripts/dropdb.c
|
|
@@ -122,10 +122,6 @@ main(int argc, char *argv[])
|
|
exit(0);
|
|
}
|
|
|
|
- initPQExpBuffer(&sql);
|
|
-
|
|
- appendPQExpBuffer(&sql, "DROP DATABASE %s%s;",
|
|
- (if_exists ? "IF EXISTS " : ""), fmtIdEnc(dbname, PQclientEncoding(conn)));
|
|
|
|
/* Avoid trying to drop postgres db while we are connected to it. */
|
|
if (maintenance_db == NULL && strcmp(dbname, "postgres") == 0)
|
|
2.39.5 (Apple Git-154)
|
|
|