From 97ef407aec729e71cd7cda43e58c2cb7380587f0 Mon Sep 17 00:00:00 2001
From: Luis Lavena <luislavena@gmail.com>
Date: Sat, 18 Feb 2017 20:30:05 -0300
Subject: [PATCH] Improves support for non-ascii keys in a tree
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Properly recognize and organize non-ascii keys into nodes, allowing
usage with entries in other languages.

With this change, it is possible to use multi-byte Unicode characters
(2 or 3 bytes wide) in keys without issues:

    tree = Radix::Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/日本は難しい", :japanese_is_difficult

Which produces the following node hierarchy:

    # ( 1) /       (:root)
    # ( 6)  日本
    # (12)    は難しい (:japanese_is_difficult)
    # ( 3)    語    (:japanese)

And lookup works as expected:

    result = tree.find "/日本は難しい"
    puts result.found? # => true
---
 CHANGELOG.md            |  1 +
 spec/radix/tree_spec.cr | 45 +++++++++++++++++++++++++++++++++++++++++
 src/radix/tree.cr       | 18 ++++++++---------
 3 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cc2579b..f44b5e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ so please check *Changed* and *Removed* notes before upgrading.
 ## [Unreleased]
 ### Fixed
 - Correct lookup issue caused by incorrect comparison of shared key [#21](https://github.com/luislavena/radix/issues/21)
+- Improve support for non-ascii keys in a tree.
 
 ## [0.3.7] - 2017-02-04
 ### Fixed
diff --git a/spec/radix/tree_spec.cr b/spec/radix/tree_spec.cr
index 3f100a7..8545f7a 100644
--- a/spec/radix/tree_spec.cr
+++ b/spec/radix/tree_spec.cr
@@ -172,6 +172,38 @@ module Radix
         end
       end
 
+      context "dealing with unicode" do
+        it "inserts properly adjacent parent nodes" do
+          tree = Tree(Symbol).new
+          tree.add "/", :root
+          tree.add "/日本語", :japanese
+          tree.add "/素晴らしい", :amazing
+
+          # /          (:root)
+          # +-素晴らしい    (:amazing)
+          # \-日本語      (:japanese)
+          tree.root.children.size.should eq(2)
+          tree.root.children[0].key.should eq("素晴らしい")
+          tree.root.children[1].key.should eq("日本語")
+        end
+
+        it "inserts nodes with shared parent" do
+          tree = Tree(Symbol).new
+          tree.add "/", :root
+          tree.add "/日本語", :japanese
+          tree.add "/日本は難しい", :japanese_is_difficult
+
+          # /  (:root) with a single child "日本" that holds:
+          #     +-は難しい     (:japanese_is_difficult)
+          #     \-語          (:japanese)
+          tree.root.children.size.should eq(1)
+          tree.root.children[0].key.should eq("日本")
+          tree.root.children[0].children.size.should eq(2)
+          tree.root.children[0].children[0].key.should eq("は難しい")
+          tree.root.children[0].children[1].key.should eq("語")
+        end
+      end
+
       context "dealing with duplicates" do
         it "does not allow same path be defined twice" do
           tree = Tree(Symbol).new
@@ -349,6 +381,19 @@ module Radix
         end
       end
 
+      context "unicode nodes with shared parent" do
+        it "finds matching path" do
+          tree = Tree(Symbol).new
+          tree.add "/", :root
+          tree.add "/日本語", :japanese
+          tree.add "/日本日本語は難しい", :japanese_is_difficult
+
+          result = tree.find("/日本日本語は難しい/")
+          result.found?.should be_true
+          result.key.should eq("/日本日本語は難しい")
+        end
+      end
+
       context "dealing with catch all" do
         it "finds matching path" do
           tree = Tree(Symbol).new
diff --git a/src/radix/tree.cr b/src/radix/tree.cr
index f65cb54..09d93fb 100644
--- a/src/radix/tree.cr
+++ b/src/radix/tree.cr
@@ -125,7 +125,7 @@ module Radix
       # determine split point difference between path and key
       # compare if path is larger than key
       if path_reader.pos == 0 ||
-         (path_reader.pos < path.size && path_reader.pos >= node.key.size)
+         (path_reader.pos < path.bytesize && path_reader.pos >= node.key.bytesize)
         # determine if a child of this node contains the remaining part
         # of the path
         added = false
@@ -156,7 +156,7 @@ module Radix
 
         # adjust priorities
         node.sort!
-      elsif path_reader.pos == path.size && path_reader.pos == node.key.size
+      elsif path_reader.pos == path.bytesize && path_reader.pos == node.key.bytesize
         # determine if path matches key and potentially be a duplicate
         # and raise if is the case
 
@@ -166,7 +166,7 @@ module Radix
           # assign payload since this is an empty node
           node.payload = payload
         end
-      elsif path_reader.pos > 0 && path_reader.pos < node.key.size
+      elsif path_reader.pos > 0 && path_reader.pos < node.key.bytesize
         # determine if current node key needs to be split to accomodate new
         # children nodes
 
@@ -187,7 +187,7 @@ module Radix
         node.sort!
 
         # determine if path still continues
-        if path_reader.pos < path.size
+        if path_reader.pos < path.bytesize
           new_key = path.byte_slice(path_reader.pos)
           node.children << Node(T).new(new_key, payload)
           node.sort!
@@ -237,7 +237,7 @@ module Radix
       # special consideration when comparing the first node vs. others
       # in case of node key and path being the same, return the node
       # instead of walking character by character
-      if first && (path.size == node.key.size && path == node.key) && node.payload?
+      if first && (path.bytesize == node.key.bytesize && path == node.key) && node.payload?
         result.use node
         return
       end
@@ -303,8 +303,8 @@ module Radix
       # nodes
       if path_reader.has_next?
         # using trailing slash?
-        if node.key.size > 0 &&
-           path_reader.pos + 1 == path.size &&
+        if node.key.bytesize > 0 &&
+           path_reader.pos + 1 == path.bytesize &&
            path_reader.current_char == '/'
           result.use node
           return
@@ -329,14 +329,14 @@ module Radix
       # key still contains characters to walk
       if key_reader.has_next?
         # determine if there is just a trailing slash?
-        if key_reader.pos + 1 == node.key.size &&
+        if key_reader.pos + 1 == node.key.bytesize &&
            key_reader.current_char == '/'
           result.use node
           return
         end
 
         # check if remaining part is catch all
-        if key_reader.pos < node.key.size &&
+        if key_reader.pos < node.key.bytesize &&
            ((key_reader.current_char == '/' && key_reader.peek_next_char == '*') ||
            key_reader.current_char == '*')
           # skip to '*' only if necessary