Merge pull request #3 from asiekierka/master

utf8 fixes
2016-01-19 20:18:53 +01:00 · 2016-01-19 20:18:53 +01:00 · 801eee2880
commit 801eee2880
parent e9e82243f2 494c22c408
9 changed files with 98 additions and 2211 deletions
--- a/include/luares.h
+++ b/include/luares.h
@ -8,8 +8,6 @@ extern char lua_filesystem[];
 extern char lua_init[];
 extern char lua_sandbox[];
 extern char lua_textgpu[];
-extern char lua_utf8_utf8data[];
-extern char lua_utf8_utf8[];
 extern char lua_util_color[];
 extern char lua_util_random[];
 #endif
--- a/src/c/lnative.c
+++ b/src/c/lnative.c
@ -15,6 +15,7 @@
 #include <fcntl.h>
 #include <dirent.h>
 #include <ftw.h>
+#include <wchar.h>
 #include <limits.h>
 #include <linux/kd.h>

@ -350,6 +351,21 @@ static int l_pull (lua_State *L) {
  return 1;
 }

+static int l_wcwidth (lua_State *L) { /* Lua: wcwidth(codepoint) -> column count reported by wcwidth(3) */
+  lua_pushinteger(L, wcwidth((wchar_t) lua_tointeger(L, 1))); /* stay integral; the old code round-tripped through double */
+  return 1; /* one result left on the Lua stack */
+}
+
+static int l_towupper (lua_State *L) { /* Lua: towupper(codepoint) -> upper-case code point; NOTE(review): ISO C declares towupper in <wctype.h> - confirm it is included */
+  lua_pushinteger(L, towupper((wint_t) lua_tointeger(L, 1))); /* stay integral; the old code round-tripped through double */
+  return 1; /* one result left on the Lua stack */
+}
+
+static int l_towlower (lua_State *L) { /* Lua: towlower(codepoint) -> lower-case code point; NOTE(review): ISO C declares towlower in <wctype.h> - confirm it is included */
+  lua_pushinteger(L, towlower((wint_t) lua_tointeger(L, 1))); /* stay integral; the old code round-tripped through double */
+  return 1; /* one result left on the Lua stack */
+}
+
 void luanative_start(lua_State *L) {
  lua_createtable (L, 0, 1);
  
@ -371,6 +387,10 @@ void luanative_start(lua_State *L) {
  pushctuple(L, "fs_size", l_fs_size);
  pushctuple(L, "fs_read", l_fs_read);

+  pushctuple(L, "wcwidth", l_wcwidth);
+  pushctuple(L, "towlower", l_towlower);
+  pushctuple(L, "towupper", l_towupper);
+
  pushctuple(L, "beep", l_beep);
  pushctuple(L, "uptime", l_uptime);
  pushctuple(L, "totalMemory", l_totalMemory);
--- a/src/c/modules.c
+++ b/src/c/modules.c
@ -18,8 +18,6 @@ void setup_modules(lua_State *L) {
  pushstuple(L, "textgpu", lua_textgpu);
  pushstuple(L, "color", lua_util_color);
  pushstuple(L, "random", lua_util_random);
-  pushstuple(L, "utf8data", lua_utf8_utf8data);
-  pushstuple(L, "utf8", lua_utf8_utf8);

  pushstuple(L, "eepromDefault", res_eepromDefault);

--- a/src/lua/core/init.lua
+++ b/src/lua/core/init.lua
@ -43,8 +43,6 @@ function main()
  --Utils
  loadModule("random")
  loadModule("color")
-  loadModule("utf8data")
-  loadModule("utf8")

  modules.address = modules.random.uuid() --TODO: PREALPHA: Make constant

--- a/src/lua/core/sandbox.lua
+++ b/src/lua/core/sandbox.lua
@ -1,4 +1,14 @@
 local sandbox
+
+local l_wlen = function(text) -- display width of text: sum of native.wcwidth over its code points
+      checkArg(1, text, "string")
+      local width = 0
+      for _, c in utf8.codes(text) do
+        width = width + native.wcwidth(c) -- NOTE(review): POSIX wcwidth() yields -1 for non-printable code points
+      end
+      return width -- previously returned utf8.len(text), discarding the accumulated width
+    end
+
 sandbox = {
  assert = assert,
  dofile = nil,
@ -163,20 +173,72 @@ sandbox = {
  unicode = {
    char = utf8.char,
    charWidth = function(c)
-      checkArg(1, c, "string")
-      return modules.utf8.charbytes(c)
+      return l_wlen(c)
    end,
    isWide = function(c)
-      checkArg(1, c, "string")
-      return modules.utf8.charbytes(c) > 1
+      return l_wlen(c) > 1
    end,
    len = utf8.len,
-    lower = modules.utf8.lower,
-    reverse = modules.utf8.reverse,
-    sub = modules.utf8.sub,
-    upper = modules.utf8.upper,
-    wlen = utf8.len, --How is it different from len?
-    --wtrunc?
+    lower = function(text) -- maps every code point of text through native.towlower
+      checkArg(1, text, "string")
+      local out = {} -- collect pieces and join once instead of O(n^2) repeated concatenation
+      for _, c in utf8.codes(text) do
+        out[#out + 1] = utf8.char(native.towlower(c))
+      end
+      return table.concat(out)
+    end,
+    upper = function(text) -- maps every code point of text through native.towupper
+      checkArg(1, text, "string")
+      local out = {} -- collect pieces and join once instead of O(n^2) repeated concatenation
+      for _, c in utf8.codes(text) do
+        out[#out + 1] = utf8.char(native.towupper(c))
+      end
+      return table.concat(out)
+    end,
+    reverse = function(text) -- returns the code points of text in reverse order
+      checkArg(1, text, "string")
+      local flipped = ""
+      for _, cp in utf8.codes(text) do
+        flipped = utf8.char(cp) .. flipped -- prepend, so the last code point ends up first
+      end
+      return flipped
+    end,
+    sub = function(s, i, j) -- UTF-8 aware string.sub: i and j count code points, not bytes
+      checkArg(1, s, "string")
+      i = i or 1
+      j = j or math.maxinteger
+      if i<1 or j<1 then
+        -- negative/zero indices need the length; utf8.len gives nil for invalid UTF-8
+        local n = utf8.len(s)
+        if not n then return nil end
+        if i<0 then i = n+1+i end
+        if j<0 then j = n+1+j end
+        -- as in string.sub: start floors at 1, end caps at n; out-of-range pairs fall to j<i
+        if i<1 then i = 1 end
+        if j>n then j = n end
+      end
+      if j<i then return "" end
+      local from = utf8.offset(s, i)
+      if not from then return "" end -- start lies beyond the end of s
+      local to = j < math.maxinteger and utf8.offset(s, j+1) -- j+1 would wrap at maxinteger
+      if to then return s:sub(from, to-1) else return s:sub(from) end
+    end,
+    wlen = l_wlen,
+    -- Keeps the leading code points of s while the running display width stays below l.
+    wtrunc = function(s, l)
+      checkArg(1, s, "string")
+      checkArg(2, l, "number")
+      local used, kept = 0, ""
+      for _, cp in utf8.codes(s) do
+        -- once the budget is reached, later code points are scanned but never appended
+        if used < l then
+          used = used + native.wcwidth(cp)
+          -- the code point that reaches or crosses l is itself dropped
+          if used < l then kept = kept .. utf8.char(cp) end
+        end
+      end
+      return kept
+    end,
  },
  checkArg = checkArg,
  og = _G
--- a/src/lua/core/textgpu.lua
+++ b/src/lua/core/textgpu.lua
@ -39,7 +39,7 @@ local function prepareBuffers(w, h)
  end
 end

-local usub = modules.utf8.sub
+local usub = modules.sandbox.unicode.sub -- the sandbox table defines sub under "unicode"; Lua's utf8 library has no sub (NOTE(review): confirm modules.sandbox is that table)
 local function insertString(main, sub, at)
  return usub(main, 1, at - 1) .. sub .. usub(main, at + utf8.len(sub))
 end
--- a/src/lua/core/utf8/utf8.lua
+++ b/src/lua/core/utf8/utf8.lua
@ -1,328 +0,0 @@
-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
--
-- Provides UTF-8 aware string functions implemented in pure lua:
-- * string.utf8len(s)
-- * string.utf8sub(s, i, j)
-- * string.utf8reverse(s)
--
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these
-- additional functions are available:
-- * string.utf8upper(s)
-- * string.utf8lower(s)
--
-- All functions behave as their non UTF-8 aware counterparts with the exception
-- that UTF-8 characters are used instead of bytes for all units.
-
--[[
-Copyright (c) 2006-2007, Kyle Smith
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-		* Redistributions of source code must retain the above copyright notice,
-			this list of conditions and the following disclaimer.
-		* Redistributions in binary form must reproduce the above copyright
-			notice, this list of conditions and the following disclaimer in the
-			documentation and/or other materials provided with the distribution.
-		* Neither the name of the author nor the names of its contributors may be
-			used to endorse or promote products derived from this software without
-			specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--]]
-
-- ABNF from RFC 3629
--
-- UTF8-octets = *( UTF8-char )
-- UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
-- UTF8-1      = %x00-7F
-- UTF8-2      = %xC2-DF UTF8-tail
-- UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
--               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
-- UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
--               %xF4 %x80-8F 2( UTF8-tail )
-- UTF8-tail   = %x80-BF
--
-
-local strbyte, strlen, strsub, type = string.byte, string.len, string.sub, type
-local lutf8 = utf8
-local utf8 = {}
-
-- returns the number of bytes used by the UTF-8 character at byte i in s
-- also doubles as a UTF-8 character validator
-local function utf8charbytes(s, i)
-	-- argument defaults
-	i = i or 1
-
-	-- argument checking
-	if type(s) ~= "string" then
-		error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
-	end
-	if type(i) ~= "number" then
-		error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
-	end
-
-	local c = strbyte(s, i)
-
-	-- determine bytes needed for character, based on RFC 3629
-	-- validate byte 1
-	if c > 0 and c <= 127 then
-		-- UTF8-1
-		return 1
-
-	elseif c >= 194 and c <= 223 then
-		-- UTF8-2
-		local c2 = strbyte(s, i + 1)
-
-		if not c2 then
-			error("UTF-8 string terminated early")
-		end
-
-		-- validate byte 2
-		if c2 < 128 or c2 > 191 then
-			error("Invalid UTF-8 character")
-		end
-
-		return 2
-
-	elseif c >= 224 and c <= 239 then
-		-- UTF8-3
-		local c2 = strbyte(s, i + 1)
-		local c3 = strbyte(s, i + 2)
-
-		if not c2 or not c3 then
-			error("UTF-8 string terminated early")
-		end
-
-		-- validate byte 2
-		if c == 224 and (c2 < 160 or c2 > 191) then
-			error("Invalid UTF-8 character")
-		elseif c == 237 and (c2 < 128 or c2 > 159) then
-			error("Invalid UTF-8 character")
-		elseif c2 < 128 or c2 > 191 then
-			error("Invalid UTF-8 character")
-		end
-
-		-- validate byte 3
-		if c3 < 128 or c3 > 191 then
-			error("Invalid UTF-8 character")
-		end
-
-		return 3
-
-	elseif c >= 240 and c <= 244 then
-		-- UTF8-4
-		local c2 = strbyte(s, i + 1)
-		local c3 = strbyte(s, i + 2)
-		local c4 = strbyte(s, i + 3)
-
-		if not c2 or not c3 or not c4 then
-			error("UTF-8 string terminated early")
-		end
-
-		-- validate byte 2
-		if c == 240 and (c2 < 144 or c2 > 191) then
-			error("Invalid UTF-8 character")
-		elseif c == 244 and (c2 < 128 or c2 > 143) then
-			error("Invalid UTF-8 character")
-		elseif c2 < 128 or c2 > 191 then
-			error("Invalid UTF-8 character")
-		end
-
-		-- validate byte 3
-		if c3 < 128 or c3 > 191 then
-			error("Invalid UTF-8 character")
-		end
-
-		-- validate byte 4
-		if c4 < 128 or c4 > 191 then
-			error("Invalid UTF-8 character")
-		end
-
-		return 4
-
-	else
-		error("Invalid UTF-8 character")
-	end
-end
-
-utf8.charbytes = utf8charbytes
-
-- returns the number of characters in a UTF-8 string
-local function utf8len(s)
-	-- argument checking
-	if type(s) ~= "string" then
-		error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
-	end
-
-	local pos = 1
-	local bytes = strlen(s)
-	local len = 0
-
-	while pos <= bytes do
-		len = len + 1
-		pos = pos + utf8charbytes(s, pos)
-	end
-
-	return len
-end
-
-utf8.len = utf8len
-
-- functions identically to string.sub except that i and j are UTF-8 characters
-- instead of bytes
-local function utf8sub(s, i, j)
-	-- argument defaults
-	j = j or -1
-
-	-- argument checking
-	if type(s) ~= "string" then
-		error("bad argument #1 to 'utf8sub' (string expected, got ".. type(s).. ")")
-	end
-	if type(i) ~= "number" then
-		error("bad argument #2 to 'utf8sub' (number expected, got ".. type(i).. ")")
-	end
-	if type(j) ~= "number" then
-		error("bad argument #3 to 'utf8sub' (number expected, got ".. type(j).. ")")
-	end
-
-	local pos = 1
-	local bytes = strlen(s)
-	local len = 0
-
-	-- only set l if i or j is negative
-	local l = (i >= 0 and j >= 0) or utf8len(s)
-	local startChar = (i >= 0) and i or l + i + 1
-	local endChar   = (j >= 0) and j or l + j + 1
-
-	-- can't have start before end!
-	if startChar > endChar then
-		return ""
-	end
-
-	-- byte offsets to pass to string.sub
-	local startByte, endByte = 1, bytes
-
-	while pos <= bytes do
-		len = len + 1
-
-		if len == startChar then
-			startByte = pos
-		end
-
-		pos = pos + utf8charbytes(s, pos)
-
-		if len == endChar then
-			endByte = pos - 1
-			break
-		end
-	end
-
-	return strsub(s, startByte, endByte)
-end
-
-function utf8.sub(s,i,j)
-  i = i or 1
-  j = j or math.maxinteger
-  if i<1 or j<1 then
-  	local n = lutf8.len(s)
-  	if not n then return nil end
-  	if i<0 then i = n+1+i end
-  	if j<0 then j = n+1+j end
-  	if i<0 then i = 1 elseif i>n then i = n end
-  	if j<0 then j = 1 elseif j>n then j = n end
-  end
-  if j<i then return "" end
-  i = lutf8.offset(s,i) or math.maxinteger
-  j = lutf8.offset(s,j+1) or math.maxinteger
-  if i and j then return s:sub(i,j-1)
-  	elseif i then return s:sub(i)
-  	else return ""
-  end
-end
-
--utf8.sub = utf8sub
-
-- replace UTF-8 characters based on a mapping table
-local function utf8replace(s, mapping)
-	-- argument checking
-	if type(s) ~= "string" then
-		error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
-	end
-	if type(mapping) ~= "table" then
-		error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
-	end
-
-	local pos = 1
-	local bytes = strlen(s)
-	local charbytes
-	local newstr = ""
-
-	while pos <= bytes do
-		charbytes = utf8charbytes(s, pos)
-		local c = strsub(s, pos, pos + charbytes - 1)
-
-		newstr = newstr .. (mapping[c] or c)
-
-		pos = pos + charbytes
-	end
-
-	return newstr
-end
-
-- identical to string.upper except it knows about unicode simple case conversions
-local function utf8upper(s)
-	return utf8replace(s, modules.utf8data.lc_uc)
-end
-
-utf8.upper = utf8upper
-
-- identical to string.lower except it knows about unicode simple case conversions
-local function utf8lower(s)
-	return utf8replace(s, modules.utf8data.uc_lc)
-end
-
-utf8.lower = utf8lower
-
-- identical to string.reverse except that it supports UTF-8
-local function utf8reverse(s)
-	-- argument checking
-	if type(s) ~= "string" then
-		error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
-	end
-
-	local bytes = strlen(s)
-	local pos = bytes
-	local charbytes
-	local newstr = ""
-
-	while pos > 0 do
-		c = strbyte(s, pos)
-		while c >= 128 and c <= 191 do
-			pos = pos - 1
-			c = strbyte(pos)
-		end
-
-		charbytes = utf8charbytes(s, pos)
-
-		newstr = newstr .. strsub(s, pos, pos + charbytes - 1)
-
-		pos = pos - 1
-	end
-
-	return newstr
-end
-
-utf8.reverse = utf8reverse
-
-return utf8
--- a/src/lua/core/utf8/utf8data.lua
+++ b/src/lua/core/utf8/utf8data.lua