hammerspoon/extensions/utf8/utf8.lua

380 lines
18 KiB
Lua
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

--- === hs.utf8 ===
---
--- Functions providing basic support for UTF-8 encodings
---
--- Prior to upgrading Hammerspoon's Lua interpreter to 5.3, UTF8 support was provided by including the then beta version of Lua 5.3's utf8 library as a Hammerspoon module. This is no longer necessary, but to maintain compatibility, the Lua utf8 library can still be accessed through `hs.utf8`. The documentation for the utf8 library can be found at http://www.lua.org/manual/5.3/ or from the Hammerspoon console via the help command: `help.lua.utf8`. This affects the following functions and variables:
---
--- * hs.utf8.char - help available via `help.lua.utf8.char`
--- * hs.utf8.charPattern - help available via `help.lua.utf8.charpattern`
--- * hs.utf8.codepoint - help available via `help.lua.utf8.codepoint`
--- * hs.utf8.codes - help available via `help.lua.utf8.codes`
--- * hs.utf8.len - help available via `help.lua.utf8.len`
--- * hs.utf8.offset - help available via `help.lua.utf8.offset`
---
--- Additional functions that are specific to Hammerspoon which provide expanded support for UTF8 are documented here.
---
-- Mirror utf8.X as hs.utf8.X in a case insensitive manner -- a little broader than
-- camelCase, but simpler than checking each individually for its "proper camel case"
-- version. -- edit: OK, so it's really only charPattern.. still, this is more portable
-- in case it's needed as a template for elsewhere.
-- The module table itself starts out empty. Any key that is not found on it is
-- lower-cased and looked up in the standard `utf8` library, so `charPattern`
-- (or any other capitalisation) resolves to `utf8.charpattern`.
local module = setmetatable({}, {
    __index = function(_, key)
        local wanted = string.lower(key)
        -- every entry of the utf8 library is stored under a lower-case name,
        -- so a direct lookup finds the same entry a full scan would
        return package.loaded["utf8"][wanted]
    end
})
local fnutils = require("hs.fnutils")
-- Public interface ------------------------------------------------------
-- see below -- we need it defined for the registration function, but documentation will be
-- given where we add the predefined keys.
module.registeredKeys = setmetatable({}, {
    -- Renders one "(U+XXXX) label char" line per entry, in the order produced
    -- by fnutils.sortByKeys.
    __tostring = function(object)
        local lines = {}
        for label, char in fnutils.sortByKeys(object) do
            lines[#lines + 1] = string.format("(U+%04X) %-15s %s\n", utf8.codepoint(char), label, char)
        end
        return table.concat(lines)
    end,
    -- Calling the table with a label is the same as indexing it with that label.
    __call = function(self, label)
        return self[label]
    end,
})
--- hs.utf8.registeredLabels(utf8char) -> string
--- Function
--- Returns the label name for a UTF8 character, as it is registered in `hs.utf8.registeredKeys[]`.
---
--- Parameters:
--- * utf8char -- the character to lookup in `hs.utf8.registeredKeys[]`
---
--- Returns:
--- * The string label for the UTF8 character or a string in the format of "U+XXXX", if it is not defined in `hs.utf8.registeredKeys[]`, or nil, if utf8char is not a valid UTF8 character.
---
--- Notes:
--- * For parity with `hs.utf8.registeredKeys`, this can also be invoked as if it were an array: i.e. `hs.utf8.registeredLabels(char)` is equivalent to `hs.utf8.registeredLabels[char]`
-- Shared implementation behind both `registeredLabels(char)` and
-- `registeredLabels[char]`; the first argument is the proxy table and is ignored.
--
-- Returns the label under which `x` is stored in `module.registeredKeys`;
-- otherwise "U+XXXX" when `x` is exactly one well-formed UTF-8 character;
-- otherwise nil.
--
-- Note: when one character is registered under several labels (the predefined
-- "alt" and "option" both map to U+2325), whichever label `pairs` visits first
-- is returned.
local realRegisteredLabels = function(_, x)
    for label, char in pairs(module.registeredKeys) do
        if char == x then
            return label
        end
    end
    -- utf8.len validates the whole string, so it rejects malformed or
    -- over-long input that utf8.charpattern would let through (and that
    -- utf8.codepoint would then choke on or only partially decode). The type
    -- check keeps non-string keys from raising.
    if type(x) == "string" and utf8.len(x) == 1 then
        return string.format("U+%04X", utf8.codepoint(x))
    end
    return nil
end
module.registeredLabels = setmetatable({}, {
    __index = realRegisteredLabels,
    __call  = realRegisteredLabels,
})
--- hs.utf8.codepointToUTF8(...) -> string
--- Function
--- Wrapper to `utf8.char(...)` which ensures that all codepoints return valid UTF8 characters.
---
--- Parameters:
--- * codepoints - A series of numeric Unicode code points to be converted to UTF-8 byte sequences. If a codepoint is a string (and does not start with U+), it is used as a key for lookup in `hs.utf8.registeredKeys[]`
---
--- Returns:
--- * A string containing the UTF-8 byte sequences corresponding to provided codepoints as a combined string.
---
--- Notes:
--- * Valid codepoint values are from 0x0000 - 0x10FFFF (0 - 1114111)
--- * If the codepoint provided is a string that starts with U+, then the 'U+' is converted to a '0x' so that lua can properly treat the value as numeric.
--- * Invalid codepoints are returned as the Unicode Replacement Character (U+FFFD)
--- * This includes out of range codepoints as well as the Unicode Surrogate codepoints (U+D800 - U+DFFF)
-- Converts each argument to its UTF-8 byte sequence and returns the
-- concatenation of all of them.
--
-- Each argument may be:
--   * a number                - used as the codepoint
--   * a string starting "U+"  - the prefix is rewritten to "0x" and parsed as a number
--   * any other string        - looked up as a label in module.registeredKeys
-- Anything that does not resolve to an integer in 0..0x10FFFF outside the
-- surrogate range 0xD800..0xDFFF is emitted as U+FFFD rather than raising.
-- As with any ipairs traversal, arguments after the first nil are ignored.
module.codepointToUTF8 = function(...)
    local replacementChar = utf8.char(0xFFFD)
    local pieces = {}
    for _, codepoint in ipairs(table.pack(...)) do
        if type(codepoint) == "string" then
            if codepoint:match("^U%+") then
                -- only the first gsub result (the rewritten string) is kept
                codepoint = codepoint:gsub("^U%+", "0x")
            else
                local registered = module.registeredKeys[codepoint]
                codepoint = 0xFFFD
                if registered ~= nil then
                    -- utf8.codepoint raises on an empty or malformed entry;
                    -- treat that as "invalid" instead of propagating the error
                    local ok, first = pcall(utf8.codepoint, registered)
                    if ok and first then codepoint = first end
                end
            end
        end

        -- tonumber yields nil for unparsable input (e.g. "U+ZZZZ") and
        -- math.tointeger yields nil for non-integral values (e.g. 65.5);
        -- both cases fall through to the replacement character below.
        local value = tonumber(codepoint)
        value = value and math.tointeger(value)

        if not value
            or value < 0
            or value > 0x10FFFF
            -- the surrogates cause print() to crash -- and they're invalid UTF-8 anyways
            or (value >= 0xD800 and value <= 0xDFFF)
        then
            pieces[#pieces + 1] = replacementChar
        else
            -- utf8.char emits a single byte for values below 0x80, so 7-bit
            -- ascii needs no separate branch
            pieces[#pieces + 1] = utf8.char(value)
        end
    end
    return table.concat(pieces)
end
--- hs.utf8.fixUTF8(inString[, replacementChar]) -> outString, posTable
--- Function
--- Replace invalid UTF8 character sequences in `inString` with `replacementChar` so it can be safely displayed in the console or other destination which requires valid UTF8 encoding.
---
--- Parameters:
--- * inString - String of characters which may contain invalid UTF8 byte sequences
--- * replacementChar - optional parameter to replace invalid byte sequences in `inString`. If this parameter is not provided, the default UTF8 replacement character, U+FFFD, is used.
---
--- Returns:
--- * outString - The contents of `inString` with all invalid UTF8 byte sequences replaced by the `replacementChar`.
--- * posTable - a table of indexes in `outString` indicating where `replacementChar` has been used.
---
--- Notes:
--- * This function is a slight modification to code found at http://notebook.kulchenko.com/programming/fixing-malformed-utf8-in-lua.
--- * If `replacementChar` is a multi-byte character (like U+FFFD) or multi character string, then the string length of `outString` will be longer than the string length of `inString`. The character positions in `posTable` will reflect these new positions in `outString`.
--- * To calculate the character position of the invalid characters in `inString`, use something like the following:
---
--- outString, outErrors = hs.utf8.fixUTF8(inString, replacement)
--- inErrors = {}
--- for i,p in ipairs(outErrors) do
--- table.insert(inErrors, p - ((i - 1) * (string.len(replacement) - 1)))
--- end
---
--- Where replacement is `utf8.char(0xFFFD)`, if you leave it out of the `hs.utf8.fixUTF8` function in the first line.
---
-- Returns the byte length (1 to 4) of the well-formed UTF-8 sequence starting
-- at byte `p` of `s`, or nil when the bytes at `p` do not begin one.
-- Every pattern is anchored with "^" so that only position `p` is examined;
-- an unanchored find would search the remainder of the string on every call.
local function utf8SequenceLength(s, p)
    if s:find("^[%z\1-\127]", p) then
        return 1
    elseif s:find("^[\194-\223][\128-\191]", p) then
        return 2
    elseif s:find("^\224[\160-\191][\128-\191]", p)
        or s:find("^[\225-\236][\128-\191][\128-\191]", p)
        or s:find("^\237[\128-\159][\128-\191]", p)
        or s:find("^[\238-\239][\128-\191][\128-\191]", p) then
        return 3
    elseif s:find("^\240[\144-\191][\128-\191][\128-\191]", p)
        or s:find("^[\241-\243][\128-\191][\128-\191][\128-\191]", p)
        or s:find("^\244[\128-\143][\128-\191][\128-\191]", p) then
        return 4
    end
    return nil
end

-- Walks `s` once, copying well-formed sequences through unchanged and
-- substituting `replacement` (default U+FFFD) for every byte that does not
-- start one. Returns the repaired string plus a table holding, for each
-- substitution, the byte index in the repaired string where it begins.
--
-- The replacement is inserted verbatim and skipped over rather than being
-- re-validated, so a replacement that is not itself valid UTF-8 can no longer
-- send the loop round forever.
module.fixUTF8 = function(s, replacement)
    replacement = replacement or utf8.char(0xFFFD)
    local replacementLen = string.len(replacement)
    local len = #s
    local pieces, invalid = {}, {}
    local outLen = 0      -- bytes already emitted into `pieces`
    local runStart = 1    -- first byte of the valid run not yet emitted
    local p = 1
    while p <= len do
        local width = utf8SequenceLength(s, p)
        if width then
            p = p + width
        else
            -- flush the valid bytes seen since the previous substitution
            pieces[#pieces + 1] = s:sub(runStart, p - 1)
            outLen = outLen + (p - runStart)
            -- record where the replacement lands in the output
            invalid[#invalid + 1] = outLen + 1
            pieces[#pieces + 1] = replacement
            outLen = outLen + replacementLen
            -- exactly one offending input byte is consumed per substitution
            p = p + 1
            runStart = p
        end
    end
    pieces[#pieces + 1] = s:sub(runStart)
    return table.concat(pieces), invalid
end
--- hs.utf8.registerCodepoint(label, codepoint) -> string
--- Function
--- Registers a Unicode codepoint under the given label as a UTF-8 string of bytes which can be referenced by the label later in your code as `hs.utf8.registeredKeys[label]` for convenience and readability.
---
--- Parameters:
--- * label - a string label to use as a human-readable reference when getting the UTF-8 byte sequence for use in other strings and output functions.
--- * codepoint - a Unicode codepoint in numeric or `U+xxxx` format to register with the given label.
---
--- Returns:
--- * Returns the UTF-8 byte sequence for the Unicode codepoint registered.
---
--- Notes:
--- * If a codepoint label was previously registered, this will overwrite the previous value with a new one. Because many of the special keys you may want to register have different variants, this allows you to easily modify the existing predefined defaults to suit your preferences.
--- * The return value is merely syntactic sugar and you do not need to save it locally; it can be safely ignored -- future access to the pre-converted codepoint should be retrieved as `hs.utf8.registeredKeys[label]` in your code. It looks good when invoked from the console, though ☺.
-- Converts `codepoint` with module.codepointToUTF8, stores the resulting byte
-- string in module.registeredKeys under `label` (replacing any earlier entry),
-- and hands the stored string back to the caller.
module.registerCodepoint = function(label, codepoint)
    local encoded = module.codepointToUTF8(codepoint)
    module.registeredKeys[label] = encoded
    return encoded
end
--- hs.utf8.registeredKeys[]
--- Variable
--- A collection of UTF-8 characters already converted from codepoint and available as convenient key-value pairs. UTF-8 printable versions of common Apple and OS X special keys are predefined and others can be added with `hs.utf8.registerCodepoint(label, codepoint)` for your own use.
---
--- Predefined keys include:
---
--- (U+2325) alt ⌥
--- (U+F8FF) apple 
--- (U+21E4) backtab ⇤
--- (U+21EA) capslock ⇪
--- (U+2713) checkMark ✓
--- (U+2318) cmd ⌘
--- (U+27E1) concaveDiamond ✧
--- (U+00A9) copyrightSign ©
--- (U+2303) ctrl ⌃
--- (U+232B) delete ⌫
--- (U+2193) down ↓
--- (U+21E3) down2 ⇣
--- (U+23CF) eject ⏏
--- (U+21F2) end ⇲
--- (U+2198) end2 ↘
--- (U+238B) escape ⎋
--- (U+2326) forwarddelete ⌦
--- (U+FE56) help ﹖
--- (U+21F1) home ⇱
--- (U+2196) home2 ↖
--- (U+21B8) home3 ↸
--- (U+2190) left ←
--- (U+21E0) left2 ⇠
--- (U+201C) leftDoubleQuote “
--- (U+2018) leftSingleQuote ‘
--- (U+00B7) middleDot ·
--- (U+21ED) numlock ⇭
--- (U+2325) option ⌥
--- (U+2327) padclear ⌧
--- (U+2324) padenter ⌤
--- (U+2386) padenter2 ⎆
--- (U+21A9) padenter3 ↩
--- (U+21DF) pagedown ⇟
--- (U+21DE) pageup ⇞
--- (U+233D) power ⌽
--- (U+00AE) registeredSign ®
--- (U+23CE) return ⏎
--- (U+21A9) return2 ↩
--- (U+2192) right →
--- (U+21E2) right2 ⇢
--- (U+201D) rightDoubleQuote ”
--- (U+2019) rightSingleQuote ’
--- (U+00A7) sectionSign §
--- (U+21E7) shift ⇧
--- (U+2423) space ␣
--- (U+21E5) tab ⇥
--- (U+2191) up ↑
--- (U+21E1) up2 ⇡
---
--- Notes:
--- * This table has a __tostring() metamethod which allows listing its contents in the Hammerspoon console by typing `hs.utf8.registeredKeys`.
--- * For parity with `hs.utf8.registeredLabels`, this can also be invoked as a function, i.e. `hs.utf8.registeredKeys["cmd"]` is equivalent to `hs.utf8.registeredKeys("cmd")`
-- Predefined label/codepoint pairs, registered in the order listed.
do
    local predefined = {
        { "alt",              0x2325 },
        { "apple",            0xF8FF },
        { "backtab",          0x21E4 },
        { "capslock",         0x21EA },
        { "cmd",              0x2318 },
        { "ctrl",             0x2303 },
        { "delete",           0x232B },
        { "down",             0x2193 },
        { "down2",            0x21E3 },
        { "eject",            0x23CF },
        { "end",              0x21F2 },
        { "end2",             0x2198 },
        { "escape",           0x238B },
        { "forwarddelete",    0x2326 },
        { "help",             0xFE56 },
        { "home",             0x21F1 },
        { "home2",            0x2196 },
        { "home3",            0x21B8 },
        { "left",             0x2190 },
        { "left2",            0x21E0 },
        { "numlock",          0x21ED },
        { "option",           0x2325 },
        { "padclear",         0x2327 },
        { "padenter",         0x2324 },
        { "padenter2",        0x2386 },
        { "padenter3",        0x21A9 },
        { "pagedown",         0x21DF },
        { "pageup",           0x21DE },
        { "power",            0x233D },
        { "return",           0x23CE },
        { "return2",          0x21A9 },
        { "right",            0x2192 },
        { "right2",           0x21E2 },
        { "shift",            0x21E7 },
        { "space",            0x2423 },
        { "tab",              0x21E5 },
        { "up",               0x2191 },
        { "up2",              0x21E1 },
        { "middleDot",        0x00B7 },
        { "leftSingleQuote",  0x2018 },
        { "rightSingleQuote", 0x2019 },
        { "leftDoubleQuote",  0x201C },
        { "rightDoubleQuote", 0x201D },
        { "sectionSign",      0x00A7 },
        { "copyrightSign",    0x00A9 },
        { "registeredSign",   0x00AE },
        { "checkMark",        0x2713 },
        { "concaveDiamond",   0x27E1 },
    }
    for _, entry in ipairs(predefined) do
        module.registerCodepoint(entry[1], entry[2])
    end
end
--- hs.utf8.asciiOnly(string[, all]) -> string
--- Function
--- Returns the provided string with all non-printable ascii characters escaped, except Return, Linefeed, and Tab.
---
--- Parameters:
--- * string - The input string which is to have all non-printable ascii characters escaped as \x## (a single byte hexadecimal number).
--- * all - an optional boolean parameter (default false) indicating whether or not Return, Linefeed, and Tab should also be considered "non-printable"
---
--- Returns:
--- * The cleaned up string, with non-printable characters escaped.
---
--- Notes:
--- * Because Unicode characters outside of the basic ascii alphabet are multi-byte characters, any UTF8 or other Unicode encoded character will be broken up into their individual bytes and likely escaped by this function.
--- * This function is useful for displaying binary data in a human readable way that might otherwise be inexpressible in the Hammerspoon console or other destination. For example:
--- * `utf8.charpattern`, which contains the regular expression for matching valid UTF8 encoded sequences, results in `(null)` in the Hammerspoon console, but `hs.utf8.asciiOnly(utf8.charpattern)` will display `[\x00-\x7F\xC2-\xF4][\x80-\xBF]*`.
-- Escapes every byte outside printable 7-bit ascii as \xHH (two upper-case hex
-- digits). Tab (0x09), linefeed (0x0A) and return (0x0D) are left alone unless
-- `all` is truthy. Raises "string expected", attributed to the caller, when
-- `theString` is not a string.
module.asciiOnly = function(theString, all)
    if type(theString) ~= "string" then
        error("string expected", 2)
    end

    local unprintable
    if all then
        unprintable = "[\x00-\x1f\x7f-\xff]"
    else
        unprintable = "[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]"
    end

    -- assigning to a single local drops gsub's second result (the match count)
    local escaped = theString:gsub(unprintable, function(ch)
        return string.format("\\x%02X", string.byte(ch))
    end)
    return escaped
end
--- hs.utf8.hexDump(inputString [, count]) -> string
--- Function
--- Returns a hex dump of the provided string. This is primarily useful for examining the exact makeup of binary data contained in a Lua String as individual bytes for debugging purposes.
---
--- Parameters:
--- * inputString - the data to be rendered as individual hexadecimal bytes for examination.
--- * count - an optional parameter specifying the number of bytes to display per line (default 16)
---
--- Returns:
--- * a string containing the hex dump of the input string.
---
--- Notes:
--- * Like hs.utf8.asciiOnly, this function will break up Unicode characters into their individual bytes.
--- * As an example:
--- `hs.utf8.hexDump(utf8.charpattern)` will return
--- `00 : 5B 00 2D 7F C2 2D F4 5D 5B 80 2D BF 5D 2A : [.-..-.][.-.]*`
-- Renders `stuff` as a hex dump, one line per `linemax` bytes:
--
--     <offset> : <hex bytes, space separated, padded> : <printable ascii or '.'>
--
-- * `stuff` is passed through tostring() once, so the length used for sizing
--   the offset column and the bytes actually dumped always agree (and a
--   non-string argument no longer raises on the `#` operator).
-- * `linemax` defaults to 16; a value that is not an integer >= 1 also falls
--   back to 16 instead of producing a modulo-by-zero or a broken format string.
-- * The hex column is padded by hand: string.format only accepts a field
--   width of at most two digits, so the "%-Ns" form stops working once
--   linemax * 3 reaches 100 (linemax >= 34).
module.hexDump = function(stuff, linemax)
    local data = tostring(stuff)
    local total = #data

    local requested = tonumber(linemax)
    local perLine = requested and math.tointeger(requested)
    if not perLine or perLine < 1 then
        perLine = 16
    end

    -- the offset column is as wide as the hex representation of the total
    -- length, rounded up to an even number of digits
    local digits = #string.format("%x", total)
    digits = digits + (digits % 2)
    local offsetFormat = "%0" .. tostring(digits) .. "x"
    local hexWidth = perLine * 3

    local lines = {}
    local hex, ascii = {}, {}

    -- emits the bytes collected so far as one output line
    local function flush(lineOffset)
        local hexColumn = table.concat(hex)
        lines[#lines + 1] = string.format(offsetFormat, lineOffset)
            .. " : " .. hexColumn .. string.rep(" ", hexWidth - #hexColumn)
            .. " : " .. table.concat(ascii) .. "\n"
        hex, ascii = {}, {}
    end

    for i = 1, total do
        local byte = string.byte(data, i)
        hex[#hex + 1] = string.format("%02X ", byte)
        -- anything outside 32..126 is shown as a dot in the ascii column
        if byte < 32 or byte > 126 then
            ascii[#ascii + 1] = "."
        else
            ascii[#ascii + 1] = string.char(byte)
        end
        if i % perLine == 0 then
            flush(i - perLine)
        end
    end
    -- trailing partial line, if any
    if total % perLine ~= 0 then
        flush(total - (total % perLine))
    end

    return table.concat(lines)
end
-- Return Module Object --------------------------------------------------
return module