// shim.js 7.78 KB
// Taken from: https://github.com/walling/unorm/blob/master/lib/unorm.js

/*
	* UnicodeNormalizer 1.0.0
	* Copyright (c) 2008 Matsuza
	* Dual licensed under the MIT (MIT-LICENSE.txt) and
	* GPL (GPL-LICENSE.txt) licenses.
	* $Date: 2008-06-05 16:44:17 +0200 (Thu, 05 Jun 2008) $
	* $Rev: 13309 $
*/

'use strict';

// Project-local helpers plus the normalization data table, followed by the
// single `var` statement that declares every module-level name used below.
var primitiveSet = require('../../../object/primitive-set')
  , validValue   = require('../../../object/valid-value')
  , data         = require('./_data')

  , floor = Math.floor
  // Looked up by name in the exported function to validate the requested form.
  , forms = primitiveSet('NFC', 'NFD', 'NFKC', 'NFKD')

  // A "feature" is an array read by the UChar accessors as:
  //   [0] decomposition (array of code points) or a falsy value,
  //   [1] flags: low 8 bits = canonical class, bit 8 = compatibility,
  //       bit 9 = exclude,
  //   [2] map from a following code point to the composed code point.
  // DEFAULT_FEATURE is the shared feature handed out when no specific entry
  // applies. CACHE_THRESHOLD is the per-bucket lookup count that has to be
  // exceeded before fromCache starts storing results.
  // NOTE(review): SBase/LBase/VBase/TBase and the *Count values look like the
  // constants of the Unicode Standard's Hangul syllable algorithm - confirm.
  , DEFAULT_FEATURE = [null, 0, {}], CACHE_THRESHOLD = 10, SBase = 0xAC00
  , LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21
  , TCount = 28, NCount = VCount * TCount, SCount = LCount * NCount
  // Remaining names are assigned further down in this file.
  , UChar, cache = {}, cacheCounter = [], i, fromCache, fromData, fromCpOnly
  , fromRuleBasedJamo, fromCpFilter, strategies, UCharIterator
  , RecursDecompIterator, DecompIterator, CompIterator, createIterator
  , normalize;

// A single code point together with its (possibly not yet loaded) feature.
//   cp      - numeric code point
//   feature - feature array for `cp`, or a falsy value when it has not been
//             looked up yet (prepFeature fills it in on demand)
UChar = function (cp, feature) {
	var self = this;
	self.codepoint = cp;
	self.feature = feature;
};

// Strategies
// Zero the 256 per-bucket lookup counters consulted by fromCache.
i = 0;
while (i <= 0xFF) {
	cacheCounter[i] = 0;
	i += 1;
}

// Strategy: serve `cp` from `cache` when present, otherwise delegate to `next`.
// A delegated result that carries a feature bumps the counter of its bucket
// ((cp >> 8) & 0xFF); once that counter exceeds CACHE_THRESHOLD the result is
// stored. Results without a feature are neither counted nor stored.
fromCache = function (next, cp, needFeature) {
	var hit = cache[cp], resolved, bucket;
	if (hit) return hit;

	resolved = next(cp, needFeature);
	if (!resolved.feature) return resolved;

	bucket = (cp >> 8) & 0xFF;
	cacheCounter[bucket] += 1;
	if (cacheCounter[bucket] > CACHE_THRESHOLD) cache[cp] = resolved;
	return resolved;
};

// Terminal strategy: look `cp` up in UChar.udata, which is indexed first by
// `cp & 0xFF00` and then by the code point itself. Falls back to
// DEFAULT_FEATURE when either level has no entry. `next` and `needFeature`
// are accepted only to match the common strategy signature.
fromData = function (next, cp, needFeature) {
	var page = UChar.udata[cp & 0xFF00]
	  , found = page ? page[cp] : undefined;
	return new UChar(cp, found || DEFAULT_FEATURE);
};
// Strategy: when the caller does not need the feature, skip the rest of the
// chain and return a UChar with a null feature; otherwise delegate to `next`.
fromCpOnly = function (next, cp, needFeature) {
	if (needFeature) return next(cp, needFeature);
	return new UChar(cp, null);
};

// Strategy: computes the feature of Hangul code points arithmetically instead
// of reading it from the data table. Two half-open ranges are handled here:
//   [LBase, LBase + LCount)  - leading consonants: feature[2] maps each
//                              following vowel to the composed syllable
//   [SBase, SBase + SCount)  - precomposed syllables: feature[0] is the
//                              two-element decomposition; syllables without a
//                              trailing consonant additionally get a
//                              feature[2] map from trailing consonant to the
//                              composed syllable
// Every other code point is delegated to `next(cp, needFeature)`.
// Returns a UChar for the handled ranges, otherwise whatever `next` returns.
fromRuleBasedJamo = function (next, cp, needFeature) {
	var composites, base, i, SIndex, TIndex, feature, j;
	// SBase + SCount is the first code point *after* the syllable range, so
	// the upper bound has to be tested with `<=`; with `<` that code point
	// would be given a made-up decomposition.
	if (cp < LBase || (LBase + LCount <= cp && cp < SBase) ||
			(SBase + SCount <= cp)) {
		return next(cp, needFeature);
	}
	if (LBase <= cp && cp < LBase + LCount) {
		// Leading consonant: L + V -> syllable without trailing consonant.
		composites = {};
		base = (cp - LBase) * VCount;
		for (i = 0; i < VCount; ++i) {
			composites[VBase + i] = SBase + TCount * (i + base);
		}
		// Slots 0 and 1 (decomposition, flags) are intentionally left unset.
		feature = [];
		feature[2] = composites;
		return new UChar(cp, feature);
	}

	SIndex = cp - SBase;
	TIndex = SIndex % TCount;
	feature = [];
	if (TIndex !== 0) {
		// Syllable with trailing consonant: splits into the syllable without
		// it plus the trailing consonant.
		feature[0] = [SBase + SIndex - TIndex, TBase + TIndex];
	} else {
		// Syllable without trailing consonant: splits into L + V, and may
		// still compose with any of the TCount - 1 trailing consonants.
		feature[0] = [LBase + floor(SIndex / NCount), VBase +
			floor((SIndex % NCount) / TCount)];
		feature[2] = {};
		for (j = 1; j < TCount; ++j) {
			feature[2][TBase + j] = cp + j;
		}
	}
	return new UChar(cp, feature);
};

// Strategy: short-circuits two code point ranges straight to DEFAULT_FEATURE
// without consulting the rest of the chain: everything below 60 (0x3C) and
// everything strictly between 13311 (0x33FF) and 42607 (0xA66F). All other
// code points are delegated to `next`.
fromCpFilter = function (next, cp, needFeature) {
	var belowFirstEntry = cp < 60
	  , inPlainBlock = (13311 < cp) && (cp < 42607);
	if (belowFirstEntry || inPlainBlock) return new UChar(cp, DEFAULT_FEATURE);
	return next(cp, needFeature);
};

// Lookup chain, outermost strategy first. Each strategy is invoked as
// strategy(next, cp, needFeature) where `next` is the already-linked remainder
// of the chain; the last strategy receives `null` as `next`.
strategies = [fromCpFilter, fromCache, fromCpOnly, fromRuleBasedJamo, fromData];

// UChar.fromCharCode(cp, needFeature) runs `cp` through the chain above.
UChar.fromCharCode = (function () {
	var link, chain = null, idx;
	link = function (strategy, next) {
		return function (cp, needFeature) { return strategy(next, cp, needFeature); };
	};
	// Link from the innermost strategy outwards.
	for (idx = strategies.length - 1; idx >= 0; --idx) {
		chain = link(strategies[idx], chain);
	}
	return chain;
}());

// True when `cp` lies in 0xD800..0xDBFF (inclusive).
UChar.isHighSurrogate = function (cp) {
	return 0xD800 <= cp && cp <= 0xDBFF;
};
// True when `cp` lies in 0xDC00..0xDFFF (inclusive).
UChar.isLowSurrogate = function (cp) {
	return 0xDC00 <= cp && cp <= 0xDFFF;
};

// Makes sure `this.feature` is populated: when it is still falsy, the code
// point is looked up again with needFeature = true and the resulting feature
// is copied onto this instance. Does nothing when a feature is already set.
UChar.prototype.prepFeature = function () {
	var loaded;
	if (this.feature) return;
	loaded = UChar.fromCharCode(this.codepoint, true);
	this.feature = loaded.feature;
};

// Returns the code point as a string: a single UTF-16 code unit below
// 0x10000, otherwise the corresponding high/low surrogate pair.
UChar.prototype.toString = function () {
	var cp = this.codepoint, offset, high, low;
	if (cp < 0x10000) return String.fromCharCode(cp);
	offset = cp - 0x10000;
	high = 0xD800 + floor(offset / 0x400);
	low = 0xDC00 + (offset % 0x400);
	return String.fromCharCode(high, low);
};

// Returns slot 0 of the feature (the decomposition), or null when that slot
// is falsy. Loads the feature first if necessary.
UChar.prototype.getDecomp = function () {
	var decomposition;
	this.prepFeature();
	decomposition = this.feature[0];
	if (!decomposition) return null;
	return decomposition;
};

// Tests bit 8 of the feature flags (slot 1). Returns `false` when the flags
// are falsy, otherwise the masked number (256 when set, 0 when clear), so the
// result is meant to be used in a boolean context.
UChar.prototype.isCompatibility = function () {
	var flags;
	this.prepFeature();
	flags = this.feature[1];
	if (!flags) return false;
	return flags & 0x100;
};
// Tests bit 9 of the feature flags (slot 1). Returns `false` when the flags
// are falsy, otherwise the masked number (512 when set, 0 when clear).
UChar.prototype.isExclude = function () {
	var flags;
	this.prepFeature();
	flags = this.feature[1];
	if (!flags) return false;
	return flags & 0x200;
};
// Returns the low 8 bits of the feature flags (slot 1), or 0 when the flags
// are falsy. Loads the feature first if necessary.
UChar.prototype.getCanonicalClass = function () {
	var flags;
	this.prepFeature();
	flags = this.feature[1];
	if (!flags) return 0;
	return flags & 0xff;
};
// Looks up what this character combines into when followed by `following`.
// Slot 2 of the feature maps the following code point to the composed code
// point. Returns a UChar for the composed code point, or null when there is
// no map or no entry for `following`.
UChar.prototype.getComposite = function (following) {
	var table, composed;
	this.prepFeature();
	table = this.feature[2];
	if (!table) return null;
	composed = table[following.codepoint];
	if (!composed) return null;
	return UChar.fromCharCode(composed);
};

// Iterates over `str` one code point at a time.
UCharIterator = function (str) {
	this.cursor = 0;
	this.str = str;
};

// Returns the next code point as a UChar (via UChar.fromCharCode), or null
// once the string is exhausted; at that point the reference to the string is
// dropped. A high surrogate immediately followed by a low surrogate is merged
// into one supplementary code point; an unpaired surrogate is returned as is.
UCharIterator.prototype.next = function () {
	var source = this.str, lead, trail;
	if (!source || this.cursor >= source.length) {
		this.str = null;
		return null;
	}
	lead = source.charCodeAt(this.cursor);
	this.cursor += 1;
	if (UChar.isHighSurrogate(lead) && this.cursor < source.length) {
		trail = source.charCodeAt(this.cursor);
		if (UChar.isLowSurrogate(trail)) {
			this.cursor += 1;
			return UChar.fromCharCode(
				(lead - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000
			);
		}
	}
	return UChar.fromCharCode(lead);
};

// Iterator stage that replaces every character coming out of `it` with its
// full (recursive) decomposition.
//   it   - upstream iterator; its next() yields a UChar or a falsy value
//   cano - when truthy, characters flagged as compatibility are not expanded
RecursDecompIterator = function (it, cano) {
	this.it = it;
	this.canonical = cano;
	this.resBuf = [];
};

// Returns the next decomposed UChar, or null when upstream is exhausted.
RecursDecompIterator.prototype.next = (function () {
	// Expands `uchar` into an array of UChars that have no further applicable
	// decomposition. A character is kept as is when it has no decomposition,
	// or when `cano` is truthy and the character is a compatibility one.
	var expand = function (cano, uchar) {
		var parts = uchar.getDecomp(), flat, idx;
		if (!parts || (cano && uchar.isCompatibility())) return [uchar];
		flat = [];
		for (idx = 0; idx < parts.length; ++idx) {
			// Append in place; Array#concat would return a new array and
			// leave `flat` untouched.
			flat.push.apply(flat, expand(cano, UChar.fromCharCode(parts[idx])));
		}
		return flat;
	};

	return function () {
		var upstream;
		if (this.resBuf.length === 0) {
			upstream = this.it.next();
			if (!upstream) return null;
			this.resBuf = expand(this.canonical, upstream);
		}
		return this.resBuf.shift();
	};
}());

// Iterator stage that reorders runs of characters by canonical class.
//   it - upstream iterator; its next() yields a UChar or a falsy value
DecompIterator = function (it) {
	this.it = it;
	this.resBuf = [];
};

// When the buffer is empty, pulls characters from upstream up to and including
// the next one with canonical class 0 (or until upstream ends). A character
// with a non-zero class is inserted before any buffered characters at the tail
// whose class is strictly greater, so characters of equal class keep their
// order. Returns the head of the buffer, which is `undefined` once everything
// has been consumed.
DecompIterator.prototype.next = function () {
	var pending = this.resBuf, current, ccc, slot;
	if (pending.length === 0) {
		for (;;) {
			current = this.it.next();
			if (!current) break;
			ccc = current.getCanonicalClass();
			slot = pending.length;
			if (ccc !== 0) {
				while (slot > 0 && pending[slot - 1].getCanonicalClass() > ccc) {
					slot -= 1;
				}
			}
			pending.splice(slot, 0, current);
			if (ccc === 0) break;
		}
	}
	return pending.shift();
};

// Iterator stage that merges characters coming out of `it` into composites.
//   it        - upstream iterator; its next() yields a UChar or a falsy value
//   procBuf   - characters still being worked on; procBuf[0] is the character
//               that following characters are tried against
//   resBuf    - finished characters waiting to be handed out
//   lastClass - canonical class of the character most recently appended to
//               procBuf (not updated when a character is merged away)
CompIterator = function (it) {
	this.it = it;
	this.procBuf = [];
	this.resBuf = [];
	this.lastClass = null;
};

// Returns the next finished character. Once upstream is exhausted and both
// buffers are empty the result of shift() is `undefined`.
CompIterator.prototype.next = function () {
	var uchar, starter, composite, cc;
	// Keep pulling until something is ready to be returned.
	while (this.resBuf.length === 0) {
		uchar = this.it.next();
		if (!uchar) {
			// Upstream is exhausted: whatever is still being processed is final.
			this.resBuf = this.procBuf;
			this.procBuf = [];
			break;
		}
		if (this.procBuf.length === 0) {
			// First character of a new run.
			this.lastClass = uchar.getCanonicalClass();
			this.procBuf.push(uchar);
		} else {
			starter = this.procBuf[0];
			composite = starter.getComposite(uchar);
			cc = uchar.getCanonicalClass();
			// NOTE(review): the class comparison looks like the "blocked"
			// test of the canonical composition algorithm - confirm.
			if (!!composite && (this.lastClass < cc || this.lastClass === 0)) {
				// Merge: the composite replaces procBuf[0]; `uchar` is consumed
				// and lastClass is deliberately left unchanged.
				this.procBuf[0] = composite;
			} else {
				if (cc === 0) {
					// A class-0 character that did not merge ends the current
					// run; it becomes the first element of the next procBuf.
					this.resBuf = this.procBuf;
					this.procBuf = [];
				}
				this.lastClass = cc;
				this.procBuf.push(uchar);
			}
		}
	}
	return this.resBuf.shift();
};

// Builds the iterator pipeline for a normalization form.
//   mode - one of "NFD", "NFKD", "NFC", "NFKC"
//   str  - string to iterate over
// Every pipeline is UCharIterator -> RecursDecompIterator -> DecompIterator;
// the "C" forms wrap that in a CompIterator. The canonical forms (NFD, NFC)
// pass `true` to RecursDecompIterator, the compatibility forms pass `false`.
// Returns the outermost iterator.
// Throws RangeError for any other `mode` (an Error object rather than a bare
// string, so the failure carries a stack trace).
createIterator = function (mode, str) {
	var canonical, compose, decomposed;
	switch (mode) {
	case "NFD":
		canonical = true;
		compose = false;
		break;
	case "NFKD":
		canonical = false;
		compose = false;
		break;
	case "NFC":
		canonical = true;
		compose = true;
		break;
	case "NFKC":
		canonical = false;
		compose = true;
		break;
	default:
		throw new RangeError(mode + " is invalid");
	}
	decomposed = new DecompIterator(
		new RecursDecompIterator(new UCharIterator(str), canonical)
	);
	return compose ? new CompIterator(decomposed) : decomposed;
};
// Runs `str` through the iterator pipeline for `mode` and concatenates the
// string form of every character it yields. Stops at the first falsy value
// returned by the iterator.
normalize = function (mode, str) {
	var pipeline = createIterator(mode, str), result = "", current;
	for (current = pipeline.next(); current; current = pipeline.next()) {
		result += current.toString();
	}
	return result;
};

/* Unicode data */
// Attach the table loaded from './_data'. fromData reads it through
// UChar.udata, indexing first by `cp & 0xFF00` and then by the code point.
UChar.udata =  data;

module.exports = function (/*form*/) {
	var str = String(validValue(this)), form = arguments[0];
	if (form === undefined) form = 'NFC';
	else form = String(form);
	if (!forms[form]) throw new RangeError('Invalid normalization form: ' + form);
	return normalize(form, str);
};