From 97ef407aec729e71cd7cda43e58c2cb7380587f0 Mon Sep 17 00:00:00 2001 From: Luis Lavena Date: Sat, 18 Feb 2017 20:30:05 -0300 Subject: [PATCH] Improves support for non-ascii keys in a tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Properly recognize and organize non-ascii keys into nodes, allowing usage with entries in other languages. With this change, it is possible to use 2- or 3-byte-wide (Unicode) characters without issues: tree = Radix::Tree(Symbol).new tree.add "/", :root tree.add "/日本語", :japanese tree.add "/日本は難しい", :japanese_is_difficult Which produces the following node hierarchy: # ( 1) / (:root) # ( 6) 日本 # (12) は難しい (:japanese_is_difficult) # ( 3) 語 (:japanese) And lookup works as expected: result = tree.find "/日本は難しい" puts result.found? # => true --- CHANGELOG.md | 1 + spec/radix/tree_spec.cr | 45 +++++++++++++++++++++++++++++++++++++++++ src/radix/tree.cr | 18 ++++++++--------- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc2579b..f44b5e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ so please check *Changed* and *Removed* notes before upgrading. ## [Unreleased] ### Fixed - Correct lookup issue caused by incorrect comparison of shared key [#21](https://github.com/luislavena/radix/issues/21) +- Improve support for non-ascii keys in a tree. 
## [0.3.7] - 2017-02-04 ### Fixed diff --git a/spec/radix/tree_spec.cr b/spec/radix/tree_spec.cr index 3f100a7..8545f7a 100644 --- a/spec/radix/tree_spec.cr +++ b/spec/radix/tree_spec.cr @@ -172,6 +172,38 @@ module Radix end end + context "dealing with unicode" do + it "inserts properly adjacent parent nodes" do + tree = Tree(Symbol).new + tree.add "/", :root + tree.add "/日本語", :japanese + tree.add "/素晴らしい", :amazing + + # / (:root) + # +-素晴らしい (:amazing) + # \-日本語 (:japanese) + tree.root.children.size.should eq(2) + tree.root.children[0].key.should eq("素晴らしい") + tree.root.children[1].key.should eq("日本語") + end + + it "inserts nodes with shared parent" do + tree = Tree(Symbol).new + tree.add "/", :root + tree.add "/日本語", :japanese + tree.add "/日本は難しい", :japanese_is_difficult + + # / (:root) + # \-日本語 (:japanese) + # \-日本は難しい (:japanese_is_difficult) + tree.root.children.size.should eq(1) + tree.root.children[0].key.should eq("日本") + tree.root.children[0].children.size.should eq(2) + tree.root.children[0].children[0].key.should eq("は難しい") + tree.root.children[0].children[1].key.should eq("語") + end + end + context "dealing with duplicates" do it "does not allow same path be defined twice" do tree = Tree(Symbol).new @@ -349,6 +381,19 @@ module Radix end end + context "unicode nodes with shared parent" do + it "finds matching path" do + tree = Tree(Symbol).new + tree.add "/", :root + tree.add "/日本語", :japanese + tree.add "/日本日本語は難しい", :japanese_is_difficult + + result = tree.find("/日本日本語は難しい/") + result.found?.should be_true + result.key.should eq("/日本日本語は難しい") + end + end + context "dealing with catch all" do it "finds matching path" do tree = Tree(Symbol).new diff --git a/src/radix/tree.cr b/src/radix/tree.cr index f65cb54..09d93fb 100644 --- a/src/radix/tree.cr +++ b/src/radix/tree.cr @@ -125,7 +125,7 @@ module Radix # determine split point difference between path and key # compare if path is larger than key if path_reader.pos == 0 || - (path_reader.pos < path.size 
&& path_reader.pos >= node.key.size) + (path_reader.pos < path.bytesize && path_reader.pos >= node.key.bytesize) # determine if a child of this node contains the remaining part # of the path added = false @@ -156,7 +156,7 @@ module Radix # adjust priorities node.sort! - elsif path_reader.pos == path.size && path_reader.pos == node.key.size + elsif path_reader.pos == path.bytesize && path_reader.pos == node.key.bytesize # determine if path matches key and potentially be a duplicate # and raise if is the case @@ -166,7 +166,7 @@ module Radix # assign payload since this is an empty node node.payload = payload end - elsif path_reader.pos > 0 && path_reader.pos < node.key.size + elsif path_reader.pos > 0 && path_reader.pos < node.key.bytesize # determine if current node key needs to be split to accomodate new # children nodes @@ -187,7 +187,7 @@ module Radix node.sort! # determine if path still continues - if path_reader.pos < path.size + if path_reader.pos < path.bytesize new_key = path.byte_slice(path_reader.pos) node.children << Node(T).new(new_key, payload) node.sort! @@ -237,7 +237,7 @@ module Radix # special consideration when comparing the first node vs. others # in case of node key and path being the same, return the node # instead of walking character by character - if first && (path.size == node.key.size && path == node.key) && node.payload? + if first && (path.bytesize == node.key.bytesize && path == node.key) && node.payload? result.use node return end @@ -303,8 +303,8 @@ module Radix # nodes if path_reader.has_next? # using trailing slash? - if node.key.size > 0 && - path_reader.pos + 1 == path.size && + if node.key.bytesize > 0 && + path_reader.pos + 1 == path.bytesize && path_reader.current_char == '/' result.use node return @@ -329,14 +329,14 @@ module Radix # key still contains characters to walk if key_reader.has_next? # determine if there is just a trailing slash? 
- if key_reader.pos + 1 == node.key.size && + if key_reader.pos + 1 == node.key.bytesize && key_reader.current_char == '/' result.use node return end # check if remaining part is catch all - if key_reader.pos < node.key.size && + if key_reader.pos < node.key.bytesize && ((key_reader.current_char == '/' && key_reader.peek_next_char == '*') || key_reader.current_char == '*') # skip to '*' only if necessary