根据PostgreSQL的手册,受字符分类(LC_CTYPE)影响的几个功能,有以下几个。
- upper, lower, initcap
- 大小写不敏感的模式匹配
- 使用了字符分类的正则表达式匹配
下面做一些测试。测试环境为CentOS 6.5 + PostgreSQL 9.3。
PostgreSQL的LC_CTYPE值可以在initdb或createdb时指定,也可以通过collate(实际是LC_COLLATE+LC_CTYPE的组合)在建表或SQL的表达式中指定。下面的测试,使用表达式指定LC_CTYPE。
LC_CTYPE为C时,不能识别全角英文字母。
-
postgres=# select upper('a' collate "C" );
-
a
-
-
postgres=# select lower('A' collate "C" );
-
A
-
-
postgres=# select initcap('aaa' collate "C" );
-
aaa
-
-
postgres=# select 'a' ilike 'A' collate "C";
- f
LC_CTYPE为zh_CN时,可以识别全角英文字母 。
-
postgres=# select upper('a' collate "zh_CN");
-
A
-
-
postgres=# select lower('A' collate "zh_CN");
-
a
-
-
postgres=# select initcap('aaa' collate "zh_CN");
-
Aaa
-
-
postgres=# select 'a' ilike 'A' collate "zh_CN";
- t
然而对正则表达式的字符分类,不论区域是什么都不识别全角英文字母。
-
postgres=# select 'A' collate "C" ~ '[[:upper:]]';
-
f
-
-
postgres=# select '1' collate "C" ~ '[[:alnum:]]';
-
f
-
-
postgres=# select 'A' collate "zh_CN" ~ '[[:upper:]]';
-
f
-
-
postgres=# select '1' collate "zh_CN" ~ '[[:alnum:]]';
- f
但是OS是支持的。
-
[chenhj@hanode1 ~]$ export LC_ALL=C
-
[chenhj@hanode1 ~]$ echo 'A' |grep '[[:upper:]]';
-
[chenhj@hanode1 ~]$ echo '1' |grep '[[:alnum:]]';
-
[chenhj@hanode1 ~]$ export LC_ALL=zh_CN.utf8
-
[chenhj@hanode1 ~]$ echo 'A' |grep '[[:upper:]]';
-
A
-
[chenhj@hanode1 ~]$ echo '1' |grep '[[:alnum:]]';
- 1
为什么会这样?
查看了PostgreSQL中正则表达式实现的代码。原来PostgreSQL中为了确保性能预先把字符分类属性都计算好了缓存起来的。而缓存的字符有限,最多也就缓存pg_wchar值(UTF编码时pg_wchar值相当于unicode代码点)是0~0x7FF的字符,其他的字符都认为不匹配。
src/backend/regex/regc_pg_locale.c
-
pg_ctype_get_cache(pg_wc_probefunc probefunc)
-
{
-
case PG_REGEX_LOCALE_WIDE:
-
case PG_REGEX_LOCALE_WIDE_L:
-
max_chr = (pg_wchar) 0x7FF;
-
...
-
for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
-
{
-
if ((*probefunc) (cur_chr))
-
nmatches++;
-
else if (nmatches > 0)
-
{
-
if (!store_match(pcc, cur_chr - nmatches, nmatches))
-
goto out_of_memory;
-
nmatches = 0;
-
}
-
}
-
...
- }
src/backend/utils/mb/wchar.c
-
/*
-
* convert UTF8 string to pg_wchar (UCS-4)
-
* caller must allocate enough space for "to", including a trailing
-
* len: length of from.
-
* "from" not necessarily null terminated.
-
*/
-
static int
-
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
-
{
-
int cnt = 0;
-
uint32 c1,
-
c2,
-
c3,
-
c4;
-
-
while (len > 0 && *from)
-
{
-
if ((*from & 0x80) == 0)
-
{
-
*to = *from++;
-
len--;
-
}
-
else if ((*from & 0xe0) == 0xc0)
-
{
-
if (len < 2)
-
break; /* drop trailing incomplete char */
-
c1 = *from++ & 0x1f;
-
c2 = *from++ & 0x3f;
-
*to = (c1 << 6) | c2;
-
len -= 2;
-
}
-
else if ((*from & 0xf0) == 0xe0)
-
{
-
if (len < 3)
-
break; /* drop trailing incomplete char */
-
c1 = *from++ & 0x0f;
-
c2 = *from++ & 0x3f;
-
c3 = *from++ & 0x3f;
-
*to = (c1 << 12) | (c2 << 6) | c3;
-
len -= 3;
-
}
-
else if ((*from & 0xf8) == 0xf0)
-
{
-
if (len < 4)
-
break; /* drop trailing incomplete char */
-
c1 = *from++ & 0x07;
-
c2 = *from++ & 0x3f;
-
c3 = *from++ & 0x3f;
-
c4 = *from++ & 0x3f;
-
*to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
-
len -= 4;
-
}
-
else
-
{
-
/* treat a bogus char as length 1; not ours to raise error */
-
*to = *from++;
-
len--;
-
}
-
to++;
-
cnt++;
-
}
-
*to = 0;
-
return cnt;
- }
而全角英文的unicode代码点是超过0x7FF的。
/usr/share/i18n/locales/i18n
-
...
-
upper /
-
...
-
% HALFWIDTH AND FULLWIDTH FORMS/
- <UFF21>..<UFF3A>
再试一下0x7FF以内的某个字符,发现确实是支持的。
比如下面的'?'(0xc5)

话说这些看上去很古怪的字符,会有人在中文里用吗?