Improves support for non-ascii keys in a tree

Properly recognize and organize non-ascii keys into nodes, allowing
usage with entries in other languages.

With this change, it is possible to use 2- or 3-byte-wide characters
(Unicode) without issues:

    tree = Radix::Tree(Symbol).new
    tree.add "/", :root
    tree.add "/日本語", :japanese
    tree.add "/日本は難しい", :japanese_is_difficult

Which produces the following node hierarchy:

    # ( 1) /               (:root)
    # ( 6)   日本
    # (12)     は難しい    (:japanese_is_difficult)
    # ( 3)     語          (:japanese)

And lookup works as expected:

    result = tree.find "/日本は難しい"
    puts result.found? # => true
2024-08-15 00:43:21 +00:00 · 2017-02-18 20:30:05 -03:00 · 2017-02-18 20:30:05 -03:00 · 97ef407aec
commit 97ef407aec
parent 7460033db3
3 changed files with 55 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,7 @@ so please check *Changed* and *Removed* notes before upgrading.
 ## [Unreleased]
 ### Fixed
 - Correct lookup issue caused by incorrect comparison of shared key [#21](https://github.com/luislavena/radix/issues/21)
 - Improve support for non-ascii keys in a tree.
 ## [0.3.7] - 2017-02-04
 ### Fixed
--- a/spec/radix/tree_spec.cr
+++ b/spec/radix/tree_spec.cr
@ -172,6 +172,38 @@ module Radix
        end
      end
      context "dealing with unicode" do
        # Two unicode keys that have nothing in common beyond the leading "/"
        # must both end up as direct children of the root node.
        it "inserts properly adjacent parent nodes" do
          tree = Tree(Symbol).new
          tree.add "/", :root
          tree.add "/日本語", :japanese
          tree.add "/素晴らしい", :amazing
          # expected hierarchy (children in the order asserted below):
          # /          (:root)
          # +-素晴らしい    (:amazing)
          # \-日本語      (:japanese)
          tree.root.children.size.should eq(2)
          # NOTE(review): the longer key is expected first even though it was
          # added last — presumably a result of the node sorting; confirm
          tree.root.children[0].key.should eq("素晴らしい")
          tree.root.children[1].key.should eq("日本語")
        end
        # Two unicode keys with a common multi-byte prefix must be split so
        # that the shared prefix becomes a node of its own.
        it "inserts nodes with shared parent" do
          tree = Tree(Symbol).new
          tree.add "/", :root
          tree.add "/日本語", :japanese
          tree.add "/日本は難しい", :japanese_is_difficult
          # expected hierarchy: the common prefix "日本" becomes an
          # intermediate node and each remainder hangs below it
          # /                (:root)
          # \-日本
          #     +-は難しい     (:japanese_is_difficult)
          #     \-語          (:japanese)
          tree.root.children.size.should eq(1)
          tree.root.children[0].key.should eq("日本")
          tree.root.children[0].children.size.should eq(2)
          tree.root.children[0].children[0].key.should eq("は難しい")
          tree.root.children[0].children[1].key.should eq("語")
        end
      end
      context "dealing with duplicates" do
        it "does not allow same path be defined twice" do
          tree = Tree(Symbol).new
@ -349,6 +381,19 @@ module Radix
        end
      end
      context "unicode nodes with shared parent" do
        # Lookup of a unicode path whose stored key shares the "/日本" prefix
        # with another entry.
        it "finds matching path" do
          tree = Tree(Symbol).new
          tree.add "/", :root
          tree.add "/日本語", :japanese
          tree.add "/日本日本語は難しい", :japanese_is_difficult
          # the looked-up path carries a trailing slash that the stored key
          # does not have; it is still expected to match
          result = tree.find("/日本日本語は難しい/")
          result.found?.should be_true
          # the reported key is the stored one, without the trailing slash
          result.key.should eq("/日本日本語は難しい")
        end
      end
      context "dealing with catch all" do
        it "finds matching path" do
          tree = Tree(Symbol).new
--- a/src/radix/tree.cr
+++ b/src/radix/tree.cr
@ -125,7 +125,7 @@ module Radix
      # determine split point difference between path and key
      # compare if path is larger than key
      if path_reader.pos == 0 ||
-         (path_reader.pos < path.size && path_reader.pos >= node.key.size)
+         (path_reader.pos < path.bytesize && path_reader.pos >= node.key.bytesize)
        # determine if a child of this node contains the remaining part
        # of the path
        added = false
@ -156,7 +156,7 @@ module Radix
        # adjust priorities
        node.sort!
-      elsif path_reader.pos == path.size && path_reader.pos == node.key.size
+      elsif path_reader.pos == path.bytesize && path_reader.pos == node.key.bytesize
        # determine if path matches key and potentially be a duplicate
        # and raise if is the case
@ -166,7 +166,7 @@ module Radix
          # assign payload since this is an empty node
          node.payload = payload
        end
-      elsif path_reader.pos > 0 && path_reader.pos < node.key.size
+      elsif path_reader.pos > 0 && path_reader.pos < node.key.bytesize
        # determine if current node key needs to be split to accommodate new
        # children nodes
@ -187,7 +187,7 @@ module Radix
        node.sort!
        # determine if path still continues
-        if path_reader.pos < path.size
+        if path_reader.pos < path.bytesize
          new_key = path.byte_slice(path_reader.pos)
          node.children << Node(T).new(new_key, payload)
          node.sort!
@ -237,7 +237,7 @@ module Radix
      # special consideration when comparing the first node vs. others
      # in case of node key and path being the same, return the node
      # instead of walking character by character
-      if first && (path.size == node.key.size && path == node.key) && node.payload?
+      if first && (path.bytesize == node.key.bytesize && path == node.key) && node.payload?
        result.use node
        return
      end
@ -303,8 +303,8 @@ module Radix
      # nodes
      if path_reader.has_next?
        # using trailing slash?
-        if node.key.size > 0 &&
+        if node.key.bytesize > 0 &&
-           path_reader.pos + 1 == path.size &&
+           path_reader.pos + 1 == path.bytesize &&
           path_reader.current_char == '/'
          result.use node
          return
@ -329,14 +329,14 @@ module Radix
      # key still contains characters to walk
      if key_reader.has_next?
        # determine if there is just a trailing slash?
-        if key_reader.pos + 1 == node.key.size &&
+        if key_reader.pos + 1 == node.key.bytesize &&
           key_reader.current_char == '/'
          result.use node
          return
        end
        # check if remaining part is catch all
-        if key_reader.pos < node.key.size &&
+        if key_reader.pos < node.key.bytesize &&
           ((key_reader.current_char == '/' && key_reader.peek_next_char == '*') ||
           key_reader.current_char == '*')
          # skip to '*' only if necessary