forked from izaya/LuPPC
Add unmodified utf8.lua library
This commit is contained in:
parent
15a8f03f67
commit
5d456c09ba
317
src/lua/core/utf8/utf8.lua
Normal file
317
src/lua/core/utf8/utf8.lua
Normal file
@ -0,0 +1,317 @@
|
|||||||
|
-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
|
||||||
|
--
|
||||||
|
-- Provides UTF-8 aware string functions implemented in pure lua:
|
||||||
|
-- * string.utf8len(s)
|
||||||
|
-- * string.utf8sub(s, i, j)
|
||||||
|
-- * string.utf8reverse(s)
|
||||||
|
--
|
||||||
|
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
|
||||||
|
-- additional functions are available:
|
||||||
|
-- * string.utf8upper(s)
|
||||||
|
-- * string.utf8lower(s)
|
||||||
|
--
|
||||||
|
-- All functions behave as their non UTF-8 aware counterparts with the exception
|
||||||
|
-- that UTF-8 characters are used instead of bytes for all units.
|
||||||
|
|
||||||
|
--[[
|
||||||
|
Copyright (c) 2006-2007, Kyle Smith
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of the author nor the names of its contributors may be
|
||||||
|
used to endorse or promote products derived from this software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||||
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
--]]
|
||||||
|
|
||||||
|
-- ABNF from RFC 3629
|
||||||
|
--
|
||||||
|
-- UTF8-octets = *( UTF8-char )
|
||||||
|
-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
|
||||||
|
-- UTF8-1 = %x00-7F
|
||||||
|
-- UTF8-2 = %xC2-DF UTF8-tail
|
||||||
|
-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||||||
|
-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||||||
|
-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||||||
|
-- %xF4 %x80-8F 2( UTF8-tail )
|
||||||
|
-- UTF8-tail = %x80-BF
|
||||||
|
--
|
||||||
|
|
||||||
|
local strbyte, strlen, strsub, type = string.byte, string.len, string.sub, type
|
||||||
|
|
||||||
|
-- returns the number of bytes used by the UTF-8 character at byte i in s
-- also doubles as a UTF-8 character validator
--
-- s: the string to inspect
-- i: byte offset of the first byte of the character (defaults to 1)
--
-- returns 1, 2, 3 or 4
-- raises an error when an argument has the wrong type, when there is no byte
-- at offset i, when the string ends in the middle of a character, or when the
-- bytes at i are not a well-formed sequence according to the RFC 3629 ABNF
local function utf8charbytes(s, i)
  -- argument defaults
  i = i or 1

  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
  end
  if type(i) ~= "number" then
    error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
  end

  local c = strbyte(s, i)

  -- string.byte returns no value when i does not address a byte of s; report
  -- that explicitly instead of failing on a nil comparison below
  if not c then
    error("bad argument #2 to 'utf8charbytes' (position out of range)")
  end

  -- determine bytes needed for character, based on RFC 3629
  -- validate byte 1
  if c <= 127 then
    -- UTF8-1 = %x00-7F (NUL is a valid one-byte character)
    return 1
  end

  local needed
  -- allowed range for byte 2; narrowed below for the lead bytes that the ABNF
  -- restricts (overlong forms, surrogates, code points above U+10FFFF)
  local lo, hi = 128, 191

  if c >= 194 and c <= 223 then
    -- UTF8-2
    needed = 2
  elseif c >= 224 and c <= 239 then
    -- UTF8-3
    needed = 3
    if c == 224 then
      lo = 160
    elseif c == 237 then
      hi = 159
    end
  elseif c >= 240 and c <= 244 then
    -- UTF8-4
    needed = 4
    if c == 240 then
      lo = 144
    elseif c == 244 then
      hi = 143
    end
  else
    -- a tail byte, an overlong lead byte (%xC0-C1) or %xF5-FF
    error("Invalid UTF-8 character")
  end

  -- every byte of the character must be present before any of them is validated
  for offset = 1, needed - 1 do
    if not strbyte(s, i + offset) then
      error("UTF-8 string terminated early")
    end
  end

  -- validate byte 2
  local c2 = strbyte(s, i + 1)
  if c2 < lo or c2 > hi then
    error("Invalid UTF-8 character")
  end

  -- validate the remaining tail bytes (UTF8-tail = %x80-BF)
  for offset = 2, needed - 1 do
    local tail = strbyte(s, i + offset)
    if tail < 128 or tail > 191 then
      error("Invalid UTF-8 character")
    end
  end

  return needed
end
|
||||||
|
|
||||||
|
-- counts the UTF-8 characters (not bytes) contained in s
--
-- s: the string to measure
--
-- raises an error when s is not a string; any error raised by utf8charbytes
-- while walking the string propagates to the caller
local function utf8len(s)
  -- argument checking
  local argtype = type(s)
  if argtype ~= "string" then
    error("bad argument #1 to 'utf8len' (string expected, got ".. argtype.. ")")
  end

  local total = strlen(s)
  local count, offset = 0, 1

  -- hop from one character start to the next until the string is exhausted
  while offset <= total do
    offset = offset + utf8charbytes(s, offset)
    count = count + 1
  end

  return count
end
|
||||||
|
|
||||||
|
-- expose utf8len as string.utf8len, leaving an already present entry untouched
do
  local existing = string.utf8len
  if not existing then
    string.utf8len = utf8len
  end
end
|
||||||
|
|
||||||
|
-- functions identically to string.sub except that i and j are UTF-8 characters
-- instead of bytes
--
-- s: the string to slice
-- i: position of the first character to include; negative values count from
--    the end of the string
-- j: position of the last character to include (defaults to -1, the last
--    character); negative values count from the end of the string
-- i and j are expected to be integral character positions
--
-- returns the selected substring, or "" when the range is empty
-- raises an error when an argument has the wrong type; errors raised by
-- utf8len / utf8charbytes while scanning s propagate to the caller
local function utf8sub(s, i, j)
  -- argument defaults
  j = j or -1

  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8sub' (string expected, got ".. type(s).. ")")
  end
  if type(i) ~= "number" then
    error("bad argument #2 to 'utf8sub' (number expected, got ".. type(i).. ")")
  end
  if type(j) ~= "number" then
    error("bad argument #3 to 'utf8sub' (number expected, got ".. type(j).. ")")
  end

  local bytes = strlen(s)

  -- the character count is only needed to resolve negative positions
  local l = (i < 0 or j < 0) and utf8len(s) or 0
  local startChar = (i >= 0) and i or l + i + 1
  local endChar = (j >= 0) and j or l + j + 1

  -- like string.sub, a start before the first character means "from the
  -- beginning"; without this clamp a start of 0 (or a large negative i)
  -- combined with an end of 0 would return the whole string
  if startChar < 1 then
    startChar = 1
  end

  -- can't have start before end!
  if startChar > endChar then
    return ""
  end

  -- byte offsets to pass to string.sub; the start defaults to one past the
  -- last byte so that a start beyond the final character selects nothing
  local startByte, endByte = bytes + 1, bytes

  local pos = 1
  local len = 0

  while pos <= bytes do
    len = len + 1

    if len == startChar then
      startByte = pos
    end

    pos = pos + utf8charbytes(s, pos)

    if len == endChar then
      endByte = pos - 1
      break
    end
  end

  return strsub(s, startByte, endByte)
end
|
||||||
|
|
||||||
|
-- expose utf8sub as string.utf8sub, leaving an already present entry untouched
do
  local existing = string.utf8sub
  if not existing then
    string.utf8sub = utf8sub
  end
end
|
||||||
|
|
||||||
|
-- replace UTF-8 characters based on a mapping table
--
-- s:       the string to transform
-- mapping: table indexed by single UTF-8 characters (as strings); a character
--          with no truthy entry in the table is copied through unchanged
--
-- returns the transformed string
-- raises an error when an argument has the wrong type; errors raised by
-- utf8charbytes while scanning s propagate to the caller
local function utf8replace(s, mapping)
  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
  end
  if type(mapping) ~= "table" then
    error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
  end

  local pos = 1
  local bytes = strlen(s)

  -- collect the pieces and join them once at the end; appending to a string
  -- inside the loop would copy the partial result on every iteration
  local pieces, count = {}, 0

  while pos <= bytes do
    local charbytes = utf8charbytes(s, pos)
    local c = strsub(s, pos, pos + charbytes - 1)

    count = count + 1
    pieces[count] = mapping[c] or c

    pos = pos + charbytes
  end

  return table.concat(pieces)
end
|
||||||
|
|
||||||
|
-- UTF-8 aware counterpart of string.upper: runs s through utf8replace using
-- the global lower-to-upper table utf8_lc_uc, which is looked up at call time
local function utf8upper(s)
  local uppercased = utf8replace(s, utf8_lc_uc)
  return uppercased
end
|
||||||
|
|
||||||
|
-- expose utf8upper as string.utf8upper, but only when no such entry exists yet
-- and the utf8_lc_uc mapping table is available
do
  local existing = string.utf8upper
  if not existing and utf8_lc_uc then
    string.utf8upper = utf8upper
  end
end
|
||||||
|
|
||||||
|
-- UTF-8 aware counterpart of string.lower: runs s through utf8replace using
-- the global upper-to-lower table utf8_uc_lc, which is looked up at call time
local function utf8lower(s)
  local lowercased = utf8replace(s, utf8_uc_lc)
  return lowercased
end
|
||||||
|
|
||||||
|
-- expose utf8lower as string.utf8lower, but only when no such entry exists yet
-- and the utf8_uc_lc mapping table is available
do
  local existing = string.utf8lower
  if not existing and utf8_uc_lc then
    string.utf8lower = utf8lower
  end
end
|
||||||
|
|
||||||
|
-- identical to string.reverse except that it supports UTF-8
--
-- s: the string to reverse
--
-- returns a string holding the characters of s in reverse order, with the
-- bytes of each multi-byte character kept in their original order
-- raises an error when s is not a string or is not well-formed UTF-8
local function utf8reverse(s)
  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
  end

  local pos = strlen(s)

  -- collect the characters and join them once at the end instead of growing
  -- a string inside the loop
  local chars, count = {}, 0

  while pos > 0 do
    -- pos is the last byte of the character being extracted
    local last = pos
    local c = strbyte(s, pos)

    -- step back over tail bytes (%x80-BF) until the lead byte is reached
    while c >= 128 and c <= 191 do
      pos = pos - 1
      c = strbyte(s, pos)

      -- ran off the front of the string: it starts with a tail byte
      if not c then
        error("Invalid UTF-8 character")
      end
    end

    local charbytes = utf8charbytes(s, pos)

    -- the lead byte must account for exactly the tail bytes stepped over,
    -- otherwise stray tail bytes would be silently dropped
    if pos + charbytes - 1 ~= last then
      error("Invalid UTF-8 character")
    end

    count = count + 1
    chars[count] = strsub(s, pos, last)

    pos = pos - 1
  end

  return table.concat(chars)
end
|
||||||
|
|
||||||
|
-- expose utf8reverse as string.utf8reverse, leaving an already present entry
-- untouched
do
  local existing = string.utf8reverse
  if not existing then
    string.utf8reverse = utf8reverse
  end
end
|
1860
src/lua/core/utf8/utf8data.lua
Normal file
1860
src/lua/core/utf8/utf8data.lua
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user