classifier-reborn
classifier-reborn copied to clipboard
"ArgumentError: comparison of Float with NaN failed" when searching a corpus that contains an item lacking common words
require 'classifier-reborn'

# Minimal reproduction: index three short documents in a fresh LSI instance.
# The document text is kept verbatim because the dumped state shown below
# (word list, word hashes, vectors) was produced from exactly these strings.
lsi = ClassifierReborn::LSI.new
documents = [
  "This is filler text that I invented.This is also a paragraph that could be used",
  "This post is amazing. Please take a look",
  "For all sports fan, you must watch this video. Hey you have to check this out."
]
documents.each do |document|
  lsi.add_item(document)
end
# Dump the index's internal state so the per-item vectors can be inspected.
p lsi
#<ClassifierReborn::LSI:0x007fdcfc9af868 @auto_rebuild=true, @word_list=#<ClassifierReborn::WordList:0x007fdcfe80afd8 @location_table={:filler=>0, :text=>1, :inventedthi=>2, :paragraph=>3, :could=>4, :us=>5, :post=>6, :amaz=>7, :pleas=>8, :take=>9, :look=>10, :for=>11, :sport=>12, :fan=>13, :must=>14, :watch=>15, :video=>16, :hei=>17, :check=>18, :out=>19}>, @items={"This is filler text that I invented.This is also a paragraph that could be used"=>#<ClassifierReborn::ContentNode:0x007fdcfd05ea88 @categories=[], @word_hash={:filler=>1, :text=>1, :inventedthi=>1, :paragraph=>1, :could=>1, :us=>1}, @lsi_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ]>, "This post is amazing. Please take a look"=>#<ClassifierReborn::ContentNode:0x007fdcfd05c918 @categories=[], @word_hash={:post=>1, :amaz=>1, :pleas=>1, :take=>1, :look=>1}, @lsi_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @lsi_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ]>, "For all sports fan, you must watch this video. Hey you have to check this out."=>#<ClassifierReborn::ContentNode:0x007fdcfe80b050 @categories=[], @word_hash={:for=>1, :sport=>1, :fan=>1, :must=>1, :watch=>1, :video=>1, :hei=>1, :check=>1, :out=>1}, @lsi_norm=GSL::Vector
# [ nan nan nan nan nan nan nan ... ], @lsi_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ]>}, @version=3, @built_at_version=3, @language="en", @cache_node_vectors=nil>
# Because the last content node's @lsi_norm is a GSL::Vector consisting
# entirely of NaNs, the following search raises an exception.
p lsi.search('filler')
#ArgumentError: comparison of Float with NaN failed
# from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:225:in `sort_by'
# from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:225:in `content_node_norms'
# from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:211:in `proximity_norms_for_content'
# from /Users/tariqali/.rbenv/versions/2.4.1/lib/ruby/gems/2.4.0/gems/classifier-reborn-2.2.0/lib/classifier-reborn/lsi.rb:237:in `search'
# from (irb):13
# from /Users/tariqali/.rbenv/versions/2.4.1/bin/irb:11:in `<main>'
# But if I add some new content (this item shares the stem :sport with the
# third item) and dump the index again...
lsi.add_item "I love sports"
p lsi
# #<ClassifierReborn::LSI:0x007fdcfc9af868 @auto_rebuild=true, @word_list=#<ClassifierReborn::WordList:0x007fdcfd08fe80 @location_table={:filler=>0, :text=>1, :inventedthi=>2, :paragraph=>3, :could=>4, :us=>5, :post=>6, :amaz=>7, :pleas=>8, :take=>9, :look=>10, :for=>11, :sport=>12, :fan=>13, :must=>14, :watch=>15, :video=>16, :hei=>17, :check=>18, :out=>19, :love=>20}>, @items={"This is filler text that I invented.This is also a paragraph that could be used"=>#<ClassifierReborn::ContentNode:0x007fdcfd05ea88 @categories=[], @word_hash={:filler=>1, :text=>1, :inventedthi=>1, :paragraph=>1, :could=>1, :us=>1}, @lsi_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 4.082e-01 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 3.869e-01 0.000e+00 ... ]>, "This post is amazing. Please take a look"=>#<ClassifierReborn::ContentNode:0x007fdcfd05c918 @categories=[], @word_hash={:post=>1, :amaz=>1, :pleas=>1, :take=>1, :look=>1}, @lsi_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @lsi_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.472e-01 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 4.307e-01 ... ]>, "For all sports fan, you must watch this video. Hey you have to check this out."=>#<ClassifierReborn::ContentNode:0x007fdcfe80b050 @categories=[], @word_hash={:for=>1, :sport=>1, :fan=>1, :must=>1, :watch=>1, :video=>1, :hei=>1, :check=>1, :out=>1}, @lsi_norm=GSL::Vector
# [ 1.303e-17 3.778e-18 -2.815e-17 3.778e-18 3.778e-18 3.778e-18 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 4.828e-18 1.400e-18 -1.043e-17 1.400e-18 1.400e-18 1.400e-18 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ]>, "I love sports"=>#<ClassifierReborn::ContentNode:0x007fdcfd08ff20 @categories=[], @word_hash={:love=>1, :sport=>1}, @lsi_norm=GSL::Vector
# [ 1.303e-17 3.778e-18 -2.815e-17 3.778e-18 3.778e-18 3.778e-18 0.000e+00 ... ], @lsi_vector=GSL::Vector
# [ 1.818e-17 5.272e-18 -3.927e-17 5.272e-18 5.272e-18 5.272e-18 0.000e+00 ... ], @raw_norm=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ], @raw_vector=GSL::Vector
# [ 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 ... ]>}, @version=4, @built_at_version=4, @language="en", @cache_node_vectors=nil>
# Now no content node has NaNs in its vectors, so the same search succeeds.
p lsi.search('filler')
# ["This is filler text that I invented.This is also a paragraph that could be used", "I love sports", "For all sports fan, you must watch this video. Hey you have to check this out."]
I've encountered this error in https://github.com/tra38/ZombieWriter/issues/8 . What would be the best way of handling this issue? My first instinct is to simply add all my strings and then scan through the entire LSI again to remove all content that contains vectors of NaNs, but that seems exceedingly inelegant. There has to be a better solution, right?
There's probably a case for cleaning out NaNs as we go in the library, but I'm not sure.
I'm getting the same issue.
I'm all for cleaning these up, but it may change behavior. I'm happy to review/merge a PR if either @tra38 or @epugh want to submit one.