926 lines
20 KiB
Plaintext
926 lines
20 KiB
Plaintext
|
|
#define PERL_NO_GET_CONTEXT /* we want efficiency */
|
|
|
|
/* private functions which need pTHX_ and aTHX_
|
|
pv_cat_decompHangul
|
|
sv_2pvunicode
|
|
pv_utf8_decompose
|
|
pv_utf8_reorder
|
|
pv_utf8_compose
|
|
*/
|
|
|
|
#include "EXTERN.h"
|
|
#include "perl.h"
|
|
#include "XSUB.h"
|
|
|
|
/* These 5 files are prepared by mkheader */
|
|
#include "unfcmb.h"
|
|
#include "unfcan.h"
|
|
#include "unfcpt.h"
|
|
#include "unfcmp.h"
|
|
#include "unfexc.h"
|
|
|
|
/*
 * Since Perl v5.20 the generated normalization tables are expressed in the
 * native character set, so this file uses the 'uvchr' flavour of the UTF-8
 * conversion functions.  Before that the tables were in Unicode terms; for
 * those perls the 'uvchr' names are redirected to their 'uvuni' counterparts
 * (or, when those are not defined either -- Perl 5.6.1 -- to the
 * uv_to_utf8 / utf8_to_uv names).
 */
#if PERL_VERSION < 20

#  undef uvchr_to_utf8
#  if defined(uvuni_to_utf8)
#    define uvchr_to_utf8 uvuni_to_utf8
#  else /* Perl 5.6.1 */
#    define uvchr_to_utf8 uv_to_utf8
#  endif

#  undef utf8n_to_uvchr
#  if defined(utf8n_to_uvuni)
#    define utf8n_to_uvchr utf8n_to_uvuni
#  else /* Perl 5.6.1 */
#    define utf8n_to_uvchr utf8_to_uv
#  endif

#endif /* PERL_VERSION < 20 */
|
|
|
|
/*
 * Fallbacks for names that may not be defined at this point.  Each one is
 * defined here only when it is still undefined; a missing UTF8_ALLOW_* flag
 * becomes 0, i.e. it contributes nothing to AllowAnyUTF.
 */

/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
#if !defined(UTF8_ALLOW_BOM)
#  define UTF8_ALLOW_BOM (0)
#endif /* UTF8_ALLOW_BOM */

#if !defined(UTF8_ALLOW_SURROGATE)
#  define UTF8_ALLOW_SURROGATE (0)
#endif /* UTF8_ALLOW_SURROGATE */

#if !defined(UTF8_ALLOW_FE_FF)
#  define UTF8_ALLOW_FE_FF (0)
#endif /* UTF8_ALLOW_FE_FF */

#if !defined(UTF8_ALLOW_FFFF)
#  define UTF8_ALLOW_FFFF (0)
#endif /* UTF8_ALLOW_FFFF */

/* marks a variable as deliberately unused without evaluating it */
#if !defined(PERL_UNUSED_VAR)
#  define PERL_UNUSED_VAR(x) ((void)sizeof(x))
#endif

/* flag set passed to every utf8n_to_uvchr() call in this file */
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
|
|
|
|
/*
 * Make sure at least `need` more bytes can be written at `d`; used before
 * uvchr_to_utf8() or a raw byte copy into the output buffer.
 *
 * The caller must have these variables in scope:
 *   dstart - start of the output buffer (grown here with Renew),
 *   d      - current write position inside that buffer,
 *   dlen   - current capacity; dlen+1 bytes are requested from Renew.
 *
 * When the buffer is too small, dlen is increased by `need`, the buffer is
 * reallocated and `d` is re-based onto the (possibly moved) buffer.
 * `need` is evaluated more than once, so it must have no side effects.
 *
 * The expansion is a brace-enclosed block: the temporary `curlen` stays
 * private to the macro, so the macro can appear anywhere a statement can
 * and more than once in the same scope.  Call sites write no ';' after it.
 */
#define Renew_d_if_not_enough_to(need)  {                       \
        STRLEN curlen = d - dstart;                             \
        if (dlen < curlen + (need)) {                           \
            dlen += (need);                                     \
            Renew(dstart, dlen+1, U8);                          \
            d = dstart + curlen;                                \
        }                                                       \
    }
|
|
|
|
/* croak format used when utf8n_to_uvchr() reports a length of 0 (broken?);
 * the %s receives the name of the operation that was running */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"

/* croak message used when utf8_hop() lands before the start of the string
 * (maybe broken UTF-8) */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)

/* Size of the on-stack array that collects combining characters, and the
 * increment applied when that collection has to grow. */
/* enough as an initial value? */
#define CC_SEQ_SIZE (10)
#define CC_SEQ_STEP (5)
|
|
|
|
/* HANGUL begin */

/* precomposed syllables */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172

/* syllables per leading consonant (VCount * TCount) */
#define Hangul_NCount 588

/* leading consonants */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

/* vowels */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* trailing consonants; TBase itself is excluded by Hangul_IsT() because
 * index 0 stands for "no trailing consonant" */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* range predicates */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))

/* HANGUL end */
|
|
|
|
/* this is used for canonical ordering of combining characters (c.c.). */
|
|
typedef struct {
|
|
U8 cc; /* combining class */
|
|
UV uv; /* codepoint */
|
|
STRLEN pos; /* position */
|
|
} UNF_cc;
|
|
|
|
static int compare_cc(const void *a, const void *b)
|
|
{
|
|
int ret_cc;
|
|
ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
|
|
if (ret_cc)
|
|
return ret_cc;
|
|
|
|
return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
|
|
- ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
|
|
}
|
|
|
|
/*
 * Look up `uv` in the three-level UNF_canon table (plane / row / cell,
 * indexed by bits 16+, 8..15 and 0..7 of the code point).
 * Returns the table entry, or NULL when uv is above VALID_UTF_MAX or the
 * table has no plane, row or cell for it.
 */
static U8* dec_canonical(UV uv)
{
    U8 ***plane;
    U8 **row;

    if (OVER_UTF_MAX(uv))
        return NULL;

    plane = (U8***)UNF_canon[uv >> 16];
    if (plane) {
        row = plane[(uv >> 8) & 0xff];
        if (row)
            return row[uv & 0xff];
    }
    return NULL;
}
|
|
|
|
/*
 * Look up `uv` in the three-level UNF_compat table (plane / row / cell,
 * indexed by bits 16+, 8..15 and 0..7 of the code point).
 * Returns the table entry, or NULL when uv is above VALID_UTF_MAX or the
 * table has no plane, row or cell for it.
 */
static U8* dec_compat(UV uv)
{
    U8 ***plane;
    U8 **row;

    if (OVER_UTF_MAX(uv))
        return NULL;

    plane = (U8***)UNF_compat[uv >> 16];
    if (plane) {
        row = plane[(uv >> 8) & 0xff];
        if (row)
            return row[uv & 0xff];
    }
    return NULL;
}
|
|
|
|
/*
 * Return the code point that `uv` followed by `uv2` composes to, or 0 when
 * there is none (also when uv2 is 0 or either argument is above
 * VALID_UTF_MAX).
 *
 * Hangul L+V and LV+T pairs are computed arithmetically; everything else
 * is looked up in the UNF_compos table, whose cells are lists of
 * (nextchar, composite) pairs ended by an entry with nextchar == 0.
 */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *entry;

    if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    /* leading consonant + vowel => LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lindex = uv - Hangul_LBase;
        UV vindex = uv2 - Hangul_VBase;
        return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
               Hangul_TCount);
    }

    /* LV syllable + trailing consonant => LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        UV tindex = uv2 - Hangul_TBase;
        return(uv + tindex);
    }

    plane = UNF_compos[uv >> 16];
    if (! plane)
        return 0;

    row = plane[(uv >> 8) & 0xff];
    if (! row)
        return 0;

    entry = row[uv & 0xff];
    if (! entry)
        return 0;

    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        entry++;
    }
    return 0;
}
|
|
|
|
/*
 * Return the combining class stored for `uv` in the two-level-plus-cell
 * UNF_combin table, or 0 when uv is above VALID_UTF_MAX or the table has
 * no plane or row for it.
 */
static U8 getCombinClass(UV uv)
{
    U8 **plane;
    U8 *row;

    if (OVER_UTF_MAX(uv))
        return 0;

    plane = (U8**)UNF_combin[uv >> 16];
    if (plane) {
        row = plane[(uv >> 8) & 0xff];
        if (row)
            return row[uv & 0xff];
    }
    return 0;
}
|
|
|
|
/*
 * Append the jamo decomposition of the Hangul syllable `uv` at `d`
 * (leading consonant, vowel and, when present, trailing consonant), each
 * written with uvchr_to_utf8().
 * Returns the new write position; `d` is returned unchanged when uv is not
 * a Hangul syllable.  The caller must have made room for the output.
 */
static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
{
    UV sindex, lindex, vindex, tindex;

    if (! Hangul_IsS(uv))
        return d;

    sindex = uv - Hangul_SBase;
    lindex = sindex / Hangul_NCount;
    vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    tindex = sindex % Hangul_TCount;

    d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
    d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
    if (tindex)     /* index 0 means there is no trailing consonant */
        d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
    return d;
}
|
|
|
|
/*
 * Return the string value of `sv` as UTF-8 and, when `lp` is not NULL,
 * store its byte length in *lp.
 *
 * When sv does not have its UTF8 flag set, the bytes are copied into a
 * new mortal SV, that copy is upgraded with sv_utf8_upgrade(), and the
 * copy's buffer is returned, so `sv` itself is not upgraded.
 */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN len;
    char *s = SvPV(sv, len);

    if (!SvUTF8(sv)) {
        SV* upgraded = sv_2mortal(newSVpvn(s, len));
        if (!SvPOK(upgraded))
            s = SvPV_force(upgraded, len);
        sv_utf8_upgrade(upgraded);
        s = SvPV(upgraded, len);
    }

    if (lp)
        *lp = len;
    return s;
}
|
|
|
|
static
|
|
U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
|
|
{
|
|
U8* p = s;
|
|
U8* e = s + slen;
|
|
U8* dstart = *dp;
|
|
U8* d = dstart;
|
|
|
|
while (p < e) {
|
|
STRLEN retlen;
|
|
UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
|
|
if (!retlen)
|
|
croak(ErrRetlenIsZero, "decompose");
|
|
p += retlen;
|
|
|
|
if (Hangul_IsS(uv)) {
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
|
|
d = pv_cat_decompHangul(aTHX_ d, uv);
|
|
}
|
|
else {
|
|
U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
|
|
|
|
if (r) {
|
|
STRLEN len = (STRLEN)strlen((char *)r);
|
|
Renew_d_if_not_enough_to(len)
|
|
while (len--)
|
|
*d++ = *r++;
|
|
}
|
|
else {
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN)
|
|
d = uvchr_to_utf8(d, uv);
|
|
}
|
|
}
|
|
}
|
|
*dp = dstart;
|
|
return d;
|
|
}
|
|
|
|
static
|
|
U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
|
|
{
|
|
U8* p = s;
|
|
U8* e = s + slen;
|
|
U8* dstart = *dp;
|
|
U8* d = dstart;
|
|
|
|
UNF_cc seq_ary[CC_SEQ_SIZE];
|
|
UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
|
|
UNF_cc* seq_ext = NULL; /* extend if need */
|
|
STRLEN seq_max = CC_SEQ_SIZE;
|
|
STRLEN cc_pos = 0;
|
|
|
|
while (p < e) {
|
|
U8 curCC;
|
|
STRLEN retlen;
|
|
UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
|
|
if (!retlen)
|
|
croak(ErrRetlenIsZero, "reorder");
|
|
p += retlen;
|
|
|
|
curCC = getCombinClass(uv);
|
|
|
|
if (curCC != 0) {
|
|
if (seq_max < cc_pos + 1) { /* extend if need */
|
|
seq_max = cc_pos + CC_SEQ_STEP; /* new size */
|
|
if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
|
|
STRLEN i;
|
|
New(0, seq_ext, seq_max, UNF_cc);
|
|
for (i = 0; i < cc_pos; i++)
|
|
seq_ext[i] = seq_ary[i];
|
|
}
|
|
else {
|
|
Renew(seq_ext, seq_max, UNF_cc);
|
|
}
|
|
seq_ptr = seq_ext; /* use seq_ext from now */
|
|
}
|
|
|
|
seq_ptr[cc_pos].cc = curCC;
|
|
seq_ptr[cc_pos].uv = uv;
|
|
seq_ptr[cc_pos].pos = cc_pos;
|
|
++cc_pos;
|
|
|
|
if (p < e)
|
|
continue;
|
|
}
|
|
|
|
/* output */
|
|
if (cc_pos) {
|
|
STRLEN i;
|
|
|
|
if (cc_pos > 1) /* reordered if there are two c.c.'s */
|
|
qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
|
|
|
|
for (i = 0; i < cc_pos; i++) {
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN)
|
|
d = uvchr_to_utf8(d, seq_ptr[i].uv);
|
|
}
|
|
cc_pos = 0;
|
|
}
|
|
|
|
if (curCC == 0) {
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN)
|
|
d = uvchr_to_utf8(d, uv);
|
|
}
|
|
}
|
|
if (seq_ext)
|
|
Safefree(seq_ext);
|
|
*dp = dstart;
|
|
return d;
|
|
}
|
|
|
|
static
|
|
U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
|
|
{
|
|
U8* p = s;
|
|
U8* e = s + slen;
|
|
U8* dstart = *dp;
|
|
U8* d = dstart;
|
|
|
|
UV uvS = 0; /* code point of the starter */
|
|
bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
|
|
U8 preCC = 0;
|
|
|
|
UV seq_ary[CC_SEQ_SIZE];
|
|
UV* seq_ptr = seq_ary; /* use array at the beginning */
|
|
UV* seq_ext = NULL; /* extend if need */
|
|
STRLEN seq_max = CC_SEQ_SIZE;
|
|
STRLEN cc_pos = 0;
|
|
|
|
while (p < e) {
|
|
U8 curCC;
|
|
STRLEN retlen;
|
|
UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
|
|
if (!retlen)
|
|
croak(ErrRetlenIsZero, "compose");
|
|
p += retlen;
|
|
|
|
curCC = getCombinClass(uv);
|
|
|
|
if (!valid_uvS) {
|
|
if (curCC == 0) {
|
|
uvS = uv; /* the first Starter is found */
|
|
valid_uvS = TRUE;
|
|
if (p < e)
|
|
continue;
|
|
}
|
|
else {
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN)
|
|
d = uvchr_to_utf8(d, uv);
|
|
continue;
|
|
}
|
|
}
|
|
else {
|
|
bool composed;
|
|
|
|
/* blocked */
|
|
if ((iscontig && cc_pos) || /* discontiguous combination */
|
|
(curCC != 0 && preCC == curCC) || /* blocked by same CC */
|
|
(preCC > curCC)) /* blocked by higher CC: revised D2 */
|
|
composed = FALSE;
|
|
|
|
/* not blocked:
|
|
iscontig && cc_pos == 0 -- contiguous combination
|
|
curCC == 0 && preCC == 0 -- starter + starter
|
|
curCC != 0 && preCC < curCC -- lower CC */
|
|
else {
|
|
/* try composition */
|
|
UV uvComp = composite_uv(uvS, uv);
|
|
|
|
if (uvComp && !isExclusion(uvComp)) {
|
|
uvS = uvComp;
|
|
composed = TRUE;
|
|
|
|
/* preCC should not be changed to curCC */
|
|
/* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
|
|
if (p < e)
|
|
continue;
|
|
}
|
|
else
|
|
composed = FALSE;
|
|
}
|
|
|
|
if (!composed) {
|
|
preCC = curCC;
|
|
if (curCC != 0 || !(p < e)) {
|
|
if (seq_max < cc_pos + 1) { /* extend if need */
|
|
seq_max = cc_pos + CC_SEQ_STEP; /* new size */
|
|
if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
|
|
New(0, seq_ext, seq_max, UV);
|
|
Copy(seq_ary, seq_ext, cc_pos, UV);
|
|
}
|
|
else {
|
|
Renew(seq_ext, seq_max, UV);
|
|
}
|
|
seq_ptr = seq_ext; /* use seq_ext from now */
|
|
}
|
|
seq_ptr[cc_pos] = uv;
|
|
++cc_pos;
|
|
}
|
|
if (curCC != 0 && p < e)
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* output */
|
|
{
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN)
|
|
d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
|
|
}
|
|
|
|
if (cc_pos) {
|
|
STRLEN i;
|
|
|
|
for (i = 0; i < cc_pos; i++) {
|
|
Renew_d_if_not_enough_to(UTF8_MAXLEN)
|
|
d = uvchr_to_utf8(d, seq_ptr[i]);
|
|
}
|
|
cc_pos = 0;
|
|
}
|
|
|
|
uvS = uv;
|
|
}
|
|
if (seq_ext)
|
|
Safefree(seq_ext);
|
|
*dp = dstart;
|
|
return d;
|
|
}
|
|
|
|
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
|
|
|
|
# decompose(src, compat): returns a new UTF-8 flagged string holding the
# decomposition of src; a true compat selects the compatibility mapping.
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &slen);
    dst = newSVpvn("", 0);

    /* start with a buffer as large as the input; the worker grows it */
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));

    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# reorder(src): returns a new UTF-8 flagged string in which the combining
# characters of src have been put into canonical order.
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &slen);
    dst = newSVpvn("", 0);

    /* start with a buffer as large as the input; the worker grows it */
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);

    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# compose(src): returns a new UTF-8 flagged string holding the composition
# of src.  Under the alias composeContiguous (ix == 1) only contiguous
# composition is performed.
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &slen);
    dst = newSVpvn("", 0);

    /* start with a buffer as large as the input; the worker grows it */
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);

    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# NFD(src) / NFKD(src): decompose (canonical mapping for NFD, compatibility
# mapping for NFKD, i.e. ix == 1) and then reorder; returns a new UTF-8
# flagged string.
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *s, *dec, *decend, *out, *outend;
    STRLEN slen, declen, outlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &slen);

    /* step 1: decompose s into dec */
    declen = slen;
    New(0, dec, declen+1, U8);
    decend = pv_utf8_decompose(aTHX_ s, slen, &dec, declen, (bool)(ix==1));
    *decend = '\0';
    declen = decend - dec; /* no longer know real size of dec */

    /* step 2: reorder dec into out */
    outlen = declen;
    New(0, out, outlen+1, U8);
    outend = pv_utf8_reorder(aTHX_ dec, declen, &out, outlen);
    *outend = '\0';
    outlen = outend - out; /* no longer know real size of out */

    /* build the return value, then release the work buffers */
    dst = newSVpvn("", 0);
    sv_setpvn(dst, (char *)out, outlen);
    SvUTF8_on(dst);

    Safefree(dec);
    Safefree(out);
    RETVAL = dst;
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# NFC(src) / NFKC(src) / FCC(src): decompose, reorder, then compose.
# NFKC (ix == 1) decomposes with the compatibility mapping; FCC (ix == 2)
# composes contiguously only.  Returns a new UTF-8 flagged string.
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    SV *dst;
    U8 *s, *dec, *decend, *ord, *ordend, *out, *outend;
    STRLEN slen, declen, ordlen, outlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &slen);

    /* step 1: decompose s into dec */
    declen = slen;
    New(0, dec, declen+1, U8);
    decend = pv_utf8_decompose(aTHX_ s, slen, &dec, declen, (bool)(ix==1));
    *decend = '\0';
    declen = decend - dec; /* no longer know real size of dec */

    /* step 2: reorder dec into ord */
    ordlen = declen;
    New(0, ord, ordlen+1, U8);
    ordend = pv_utf8_reorder(aTHX_ dec, declen, &ord, ordlen);
    *ordend = '\0';
    ordlen = ordend - ord; /* no longer know real size of ord */

    /* step 3: compose ord into out */
    outlen = ordlen;
    New(0, out, outlen+1, U8);
    outend = pv_utf8_compose(aTHX_ ord, ordlen, &out, outlen, (bool)(ix==2));
    *outend = '\0';
    outlen = outend - out; /* no longer know real size of out */

    /* build the return value, then release the work buffers */
    dst = newSVpvn("", 0);
    sv_setpvn(dst, (char *)out, outlen);
    SvUTF8_on(dst);

    Safefree(dec);
    Safefree(ord);
    Safefree(out);
    RETVAL = dst;
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# checkNFD(src) / checkNFKD(src): returns true when src is already in the
# decomposed form (NFKD under ix == 1), false otherwise.
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);

        /* NO when canonical ordering is violated, or when the character
           is a Hangul syllable or has a decomposition of the kind asked */
        if ((preCC > curCC && curCC != 0) ||
            Hangul_IsS(uv) ||
            (ix ? dec_compat(uv) : dec_canonical(uv))) {
            result = FALSE;
            break;
        }

        preCC = curCC;
        p += retlen;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# checkNFC(src) / checkNFKC(src): quick check for the composed form
# (NFKC under ix == 1).  Returns true for YES, false for NO, and undef for
# MAYBE (a character that may compose with a preceding one was seen and
# nothing decided NO).
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property */
        /* Hangul syllables are canonical composites: YES, nothing to do */
        if (!Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            else if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when having compatibility mapping. */
                char *canon  = (char *) dec_canonical(uv);
                char *compat = (char *) dec_compat(uv);

                if (compat && (!canon || strNE(canon, compat))) {
                    result = FALSE;
                    break;
                }
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
        p += retlen;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# checkFCD(src) / checkFCC(src): checks combining-class ordering using,
# for each character, the class of the first character of its canonical
# decomposition against the class of the last character of the previous
# one.  Under checkFCC (ix == 1) the composition properties are checked as
# well and undef is returned for MAYBE.
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &srclen);
    e = s + srclen;
    preCC = 0;
    p = s;
    while (p < e) {
        U8 *sCan;
        UV uvLead;
        STRLEN canlen = 0;
        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        sCan = (U8*) dec_canonical(uv);

        /* leading character: first of the decomposition, else uv itself */
        if (sCan == NULL) {
            uvLead = uv;
        }
        else {
            STRLEN canret;
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing character: last of the decomposition, else uv itself */
        if (sCan == NULL) {
            preCC = curCC;
        }
        else {
            STRLEN canret;
            UV uvTrail;
            U8* eCan = sCan + canlen;
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }

        p += retlen;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# getCombinClass(uv): exposes the C function getCombinClass() to Perl;
# returns the combining class of code point uv.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
|
|
|
|
# isExclusion(uv): exposes isExclusion() to Perl; the same predicate is
# used by the composition code in this file to reject composites.
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
|
|
|
|
# isSingleton(uv): exposes isSingleton() to Perl; the quick checks in this
# file treat such characters as NFC_NO.
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
|
|
|
|
# isNonStDecomp(uv): exposes isNonStDecomp() to Perl; the quick checks in
# this file treat such characters as NFC_NO.
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
|
|
|
|
# isComp2nd(uv), also callable as isNFC_MAYBE and isNFKC_MAYBE: exposes
# isComp2nd() to Perl.  All three names behave the same; the alias index
# is deliberately unused.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE  = 1
    isNFKC_MAYBE = 2
  INIT:
    PERL_UNUSED_VAR(ix);
|
|
|
|
# isNFD_NO(uv) / isNFKD_NO(uv): true when uv is a Hangul syllable or has a
# canonical (isNFD_NO) or compatibility (isNFKD_NO, ix == 1) decomposition.
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result;
  CODE:
    /* NFD_NO or NFKD_NO */
    if (Hangul_IsS(uv))
        result = TRUE;
    else if (ix)
        result = dec_compat(uv) ? TRUE : FALSE;
    else
        result = dec_canonical(uv) ? TRUE : FALSE;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# isComp_Ex(uv), also callable as isNFC_NO (ix == 0) and isNFKC_NO
# (ix == 1): true when uv is an exclusion, a singleton or a non-starter
# decomposition.  Under isNFKC_NO it is also true when uv has a
# compatibility mapping that differs from its canonical one (or has no
# canonical one).
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool result = TRUE; /* NFC_NO or NFKC_NO unless cleared below */
  CODE:
    if (!(isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))) {
        result = FALSE;
        if (ix) {
            char *canon  = (char *) dec_canonical(uv);
            char *compat = (char *) dec_compat(uv);

            if (compat && (!canon || strNE(canon, compat)))
                result = TRUE; /* NFKC_NO */
        }
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
|
|
|
|
# getComposite(uv, uv2): returns the code point that uv followed by uv2
# composes to, or undef when composite_uv() finds none.
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
|
|
|
|
|
|
|
|
# getCanon(uv) / getCompat(uv): returns the canonical (or, under
# getCompat, compatibility) decomposition of uv as a UTF-8 flagged string,
# or undef when uv has no such decomposition.  Hangul syllables are
# decomposed arithmetically.
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    if (!Hangul_IsS(uv)) {
        U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    } else {
        /* room for three jamo plus one spare byte */
        U8 tmp[3 * UTF8_MAXLEN + 1];
        U8 *t = tmp;
        U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
        RETVAL = newSVpvn((char *)t, e - t);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
|
|
|
|
|
|
# splitOnLastStarter(src): returns two UTF-8 flagged strings, the part of
# src before its last starter (character with combining class 0) and the
# part from that starter to the end.  When src has no starter the first
# string is empty and the second is all of src.
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    s = (U8*)sv_2pvunicode(aTHX_ src, &srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    /* everything before the last starter */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* the last starter and what follows it */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
|
|
|