Improves support for non-ascii keys in a tree

Properly recognize and organize non-ascii keys into nodes, allowing
usage with entries in other languages.

With this change, it is possible to use 2- or 3-byte wide (Unicode)
characters in keys without issues:

    tree = Radix::Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/日本は難しい", :japanese_is_difficult

Which produces the following node hierarchy:

    # ( 1) /       (:root)
    # ( 6)  日本
    # (12)    は難しい (:japanese_is_difficult)
    # ( 3)    語    (:japanese)

And lookup works as expected:

    result = tree.find "/日本は難しい"
    puts result.found? # => true
This commit is contained in:
Luis Lavena 2017-02-18 20:30:05 -03:00
parent 7460033db3
commit 97ef407aec
3 changed files with 55 additions and 9 deletions

View file

@ -7,6 +7,7 @@ so please check *Changed* and *Removed* notes before upgrading.
## [Unreleased] ## [Unreleased]
### Fixed ### Fixed
- Correct lookup issue caused by incorrect comparison of shared key [#21](https://github.com/luislavena/radix/issues/21) - Correct lookup issue caused by incorrect comparison of shared key [#21](https://github.com/luislavena/radix/issues/21)
- Improve support for non-ascii keys in a tree.
## [0.3.7] - 2017-02-04 ## [0.3.7] - 2017-02-04
### Fixed ### Fixed

View file

@ -172,6 +172,38 @@ module Radix
end end
end end
context "dealing with unicode" do
  # Keys made only of multi-byte characters that share no prefix beyond "/"
  # must end up as separate children of the root node.
  it "inserts properly adjacent parent nodes" do
    tree = Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/素晴らしい", :amazing

    # /            (:root)
    # +-素晴らしい   (:amazing)
    # \-日本語      (:japanese)
    tree.root.children.size.should eq(2)
    tree.root.children[0].key.should eq("素晴らしい")
    tree.root.children[1].key.should eq("日本語")
  end

  # Keys sharing the multi-byte prefix "日本" must be split on a character
  # boundary into a common parent node holding that prefix.
  it "inserts nodes with shared parent" do
    tree = Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/日本は難しい", :japanese_is_difficult

    # /              (:root)
    # \-日本
    #   +-は難しい    (:japanese_is_difficult)
    #   \-語         (:japanese)
    tree.root.children.size.should eq(1)
    tree.root.children[0].key.should eq("日本")
    tree.root.children[0].children.size.should eq(2)
    tree.root.children[0].children[0].key.should eq("は難しい")
    # the remaining suffix of "/日本語" after the shared "日本" prefix
    tree.root.children[0].children[1].key.should eq("語")
  end
end
context "dealing with duplicates" do context "dealing with duplicates" do
it "does not allow same path be defined twice" do it "does not allow same path be defined twice" do
tree = Tree(Symbol).new tree = Tree(Symbol).new
@ -349,6 +381,19 @@ module Radix
end end
end end
context "unicode nodes with shared parent" do
  # Looks up a key whose multi-byte prefix is shared with another entry.
  it "finds matching path" do
    routes = Tree(Symbol).new
    routes.add("/", :root)
    routes.add("/日本語", :japanese)
    routes.add("/日本日本語は難しい", :japanese_is_difficult)

    # the lookup path carries a trailing slash; the expected key does not
    lookup = routes.find "/日本日本語は難しい/"

    lookup.found?.should be_true
    lookup.key.should eq("/日本日本語は難しい")
  end
end
context "dealing with catch all" do context "dealing with catch all" do
it "finds matching path" do it "finds matching path" do
tree = Tree(Symbol).new tree = Tree(Symbol).new

View file

@ -125,7 +125,7 @@ module Radix
# determine split point difference between path and key # determine split point difference between path and key
# compare if path is larger than key # compare if path is larger than key
if path_reader.pos == 0 || if path_reader.pos == 0 ||
(path_reader.pos < path.size && path_reader.pos >= node.key.size) (path_reader.pos < path.bytesize && path_reader.pos >= node.key.bytesize)
# determine if a child of this node contains the remaining part # determine if a child of this node contains the remaining part
# of the path # of the path
added = false added = false
@ -156,7 +156,7 @@ module Radix
# adjust priorities # adjust priorities
node.sort! node.sort!
elsif path_reader.pos == path.size && path_reader.pos == node.key.size elsif path_reader.pos == path.bytesize && path_reader.pos == node.key.bytesize
# determine if path matches key and potentially be a duplicate # determine if path matches key and potentially be a duplicate
# and raise if is the case # and raise if is the case
@ -166,7 +166,7 @@ module Radix
# assign payload since this is an empty node # assign payload since this is an empty node
node.payload = payload node.payload = payload
end end
elsif path_reader.pos > 0 && path_reader.pos < node.key.size elsif path_reader.pos > 0 && path_reader.pos < node.key.bytesize
# determine if current node key needs to be split to accommodate new # determine if current node key needs to be split to accommodate new
# children nodes # children nodes
@ -187,7 +187,7 @@ module Radix
node.sort! node.sort!
# determine if path still continues # determine if path still continues
if path_reader.pos < path.size if path_reader.pos < path.bytesize
new_key = path.byte_slice(path_reader.pos) new_key = path.byte_slice(path_reader.pos)
node.children << Node(T).new(new_key, payload) node.children << Node(T).new(new_key, payload)
node.sort! node.sort!
@ -237,7 +237,7 @@ module Radix
# special consideration when comparing the first node vs. others # special consideration when comparing the first node vs. others
# in case of node key and path being the same, return the node # in case of node key and path being the same, return the node
# instead of walking character by character # instead of walking character by character
if first && (path.size == node.key.size && path == node.key) && node.payload? if first && (path.bytesize == node.key.bytesize && path == node.key) && node.payload?
result.use node result.use node
return return
end end
@ -303,8 +303,8 @@ module Radix
# nodes # nodes
if path_reader.has_next? if path_reader.has_next?
# using trailing slash? # using trailing slash?
if node.key.size > 0 && if node.key.bytesize > 0 &&
path_reader.pos + 1 == path.size && path_reader.pos + 1 == path.bytesize &&
path_reader.current_char == '/' path_reader.current_char == '/'
result.use node result.use node
return return
@ -329,14 +329,14 @@ module Radix
# key still contains characters to walk # key still contains characters to walk
if key_reader.has_next? if key_reader.has_next?
# determine if there is just a trailing slash? # determine if there is just a trailing slash?
if key_reader.pos + 1 == node.key.size && if key_reader.pos + 1 == node.key.bytesize &&
key_reader.current_char == '/' key_reader.current_char == '/'
result.use node result.use node
return return
end end
# check if remaining part is catch all # check if remaining part is catch all
if key_reader.pos < node.key.size && if key_reader.pos < node.key.bytesize &&
((key_reader.current_char == '/' && key_reader.peek_next_char == '*') || ((key_reader.current_char == '/' && key_reader.peek_next_char == '*') ||
key_reader.current_char == '*') key_reader.current_char == '*')
# skip to '*' only if necessary # skip to '*' only if necessary