Improves support for non-ascii keys in a tree

Properly recognize and organize non-ascii keys into nodes, allowing
usage with entries in other languages.

With this change, it is possible to use multi-byte (2- or 3-byte wide)
Unicode characters without issues:

    tree = Radix::Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/日本は難しい", :japanese_is_difficult

Which produces the following node hierarchy:

    # ( 1) /       (:root)
    # ( 6)  日本
    # (12)    は難しい (:japanese_is_difficult)
    # ( 3)    語    (:japanese)

And lookup works as expected:

    result = tree.find "/日本は難しい"
    puts result.found? # => true
This commit is contained in:
Luis Lavena 2017-02-18 20:30:05 -03:00
parent 7460033db3
commit 97ef407aec
3 changed files with 55 additions and 9 deletions

View file

@ -7,6 +7,7 @@ so please check *Changed* and *Removed* notes before upgrading.
## [Unreleased]
### Fixed
- Correct lookup issue caused by incorrect comparison of shared key [#21](https://github.com/luislavena/radix/issues/21)
- Improve support for non-ascii keys in a tree.
## [0.3.7] - 2017-02-04
### Fixed

View file

@ -172,6 +172,38 @@ module Radix
end
end
# Insertion specs for keys containing multi-byte (non-ASCII) characters.
context "dealing with unicode" do
  # Two keys that share nothing beyond the root "/" become sibling
  # children of the root node.
  it "inserts properly adjacent parent nodes" do
    tree = Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/素晴らしい", :amazing

    # /            (:root)
    # +-素晴らしい  (:amazing)
    # \-日本語      (:japanese)
    tree.root.children.size.should eq(2)
    tree.root.children[0].key.should eq("素晴らしい")
    tree.root.children[1].key.should eq("日本語")
  end

  # Two keys sharing the multi-byte prefix "日本" are split into a common
  # parent node holding the prefix and one child per remaining suffix.
  it "inserts nodes with shared parent" do
    tree = Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/日本は難しい", :japanese_is_difficult

    # /             (:root)
    # \-日本
    #   +-は難しい   (:japanese_is_difficult)
    #   \-語         (:japanese)
    tree.root.children.size.should eq(1)
    tree.root.children[0].key.should eq("日本")
    tree.root.children[0].children.size.should eq(2)
    tree.root.children[0].children[0].key.should eq("は難しい")
    # the remaining suffix of "/日本語" after the shared "日本" prefix
    tree.root.children[0].children[1].key.should eq("語")
  end
end
context "dealing with duplicates" do
it "does not allow same path be defined twice" do
tree = Tree(Symbol).new
@ -349,6 +381,19 @@ module Radix
end
end
# Lookup spec for multi-byte keys that share a common prefix.
context "unicode nodes with shared parent" do
  it "finds matching path" do
    # Register the root plus two keys that both start with "/日本".
    routes = Tree(Symbol).new
    routes.add "/", :root
    routes.add "/日本語", :japanese
    routes.add "/日本日本語は難しい", :japanese_is_difficult

    # Search for the longer key with a trailing slash appended.
    lookup = routes.find("/日本日本語は難しい/")

    lookup.found?.should be_true
    lookup.key.should eq("/日本日本語は難しい")
  end
end
context "dealing with catch all" do
it "finds matching path" do
tree = Tree(Symbol).new

View file

@ -125,7 +125,7 @@ module Radix
# determine split point difference between path and key
# compare if path is larger than key
if path_reader.pos == 0 ||
(path_reader.pos < path.size && path_reader.pos >= node.key.size)
(path_reader.pos < path.bytesize && path_reader.pos >= node.key.bytesize)
# determine if a child of this node contains the remaining part
# of the path
added = false
@ -156,7 +156,7 @@ module Radix
# adjust priorities
node.sort!
elsif path_reader.pos == path.size && path_reader.pos == node.key.size
elsif path_reader.pos == path.bytesize && path_reader.pos == node.key.bytesize
# determine if path matches key and potentially be a duplicate
# and raise if is the case
@ -166,7 +166,7 @@ module Radix
# assign payload since this is an empty node
node.payload = payload
end
elsif path_reader.pos > 0 && path_reader.pos < node.key.size
elsif path_reader.pos > 0 && path_reader.pos < node.key.bytesize
# determine if current node key needs to be split to accommodate new
# children nodes
@ -187,7 +187,7 @@ module Radix
node.sort!
# determine if path still continues
if path_reader.pos < path.size
if path_reader.pos < path.bytesize
new_key = path.byte_slice(path_reader.pos)
node.children << Node(T).new(new_key, payload)
node.sort!
@ -237,7 +237,7 @@ module Radix
# special consideration when comparing the first node vs. others
# in case of node key and path being the same, return the node
# instead of walking character by character
if first && (path.size == node.key.size && path == node.key) && node.payload?
if first && (path.bytesize == node.key.bytesize && path == node.key) && node.payload?
result.use node
return
end
@ -303,8 +303,8 @@ module Radix
# nodes
if path_reader.has_next?
# using trailing slash?
if node.key.size > 0 &&
path_reader.pos + 1 == path.size &&
if node.key.bytesize > 0 &&
path_reader.pos + 1 == path.bytesize &&
path_reader.current_char == '/'
result.use node
return
@ -329,14 +329,14 @@ module Radix
# key still contains characters to walk
if key_reader.has_next?
# determine if there is just a trailing slash?
if key_reader.pos + 1 == node.key.size &&
if key_reader.pos + 1 == node.key.bytesize &&
key_reader.current_char == '/'
result.use node
return
end
# check if remaining part is catch all
if key_reader.pos < node.key.size &&
if key_reader.pos < node.key.bytesize &&
((key_reader.current_char == '/' && key_reader.peek_next_char == '*') ||
key_reader.current_char == '*')
# skip to '*' only if necessary