Select request doesn't return expected number of documents
Steps to reproduce:
- Insert about a million documents having different uniqueKey (e.g. uniqueKey = 0000001, 0000002, .... 1000000) in a fresh installed Solandra DB.
- Query ../select/?q=*:*
Actual result: the numDocs value returned as a response for ../select/?q=*:* is less than a million (about 900000). Expected result: one million documents must be returned.
Additional Info:
Cassandra version: 0.7.6 Solr version: 3.1.0 Solandra: tjake-Solandra-00b188b
You can get all the documents posted to the DB using ranged query, e.g. ../select/?q=id:[00000001 TO 00100000], ../select/?q=id:[00100001 TO 00200000], ...
You can generate a test xml using the following program:
package xmlgenerator;
import java.io.*; import java.util.Random;
import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamWriter;
public class XMLGenerator {

    /** Shared RNG used to pick random words and numbers for document field values. */
    private static final Random m_randomGenerator = new Random ();

    /** Exclusive upper bound for the zero-padded random integers embedded in field values. */
    private static final int maxRandomInt = 100000000;

    /**
     * Reads the whole file at {@code filePath} and returns its contents decoded as UTF-8.
     *
     * @param filePath path of the file to read
     * @return the complete file contents as a string
     * @throws java.io.IOException if the file cannot be opened or fully read
     */
    private static String readFileAsString (String filePath) throws java.io.IOException {
        byte [] buffer = new byte [(int) new File (filePath).length ()];
        BufferedInputStream f = null;
        try {
            f = new BufferedInputStream (new FileInputStream (filePath));
            // A single read() call may return fewer bytes than requested;
            // loop until the whole buffer is filled so the text is never truncated.
            int offset = 0;
            while (offset < buffer.length) {
                int read = f.read (buffer, offset, buffer.length - offset);
                if (read < 0)
                    throw new EOFException ("Unexpected end of file: " + filePath);
                offset += read;
            }
        } finally {
            if (f != null) try {f.close ();} catch (IOException ignored) { }
        }
        // Decode explicitly as UTF-8 rather than the platform default charset,
        // to match the encoding declared in the generated XML.
        return new String (buffer, "UTF-8");
    }

    /**
     * Appends one {@code <doc>} element to {@code xmlWriter}: an 8-digit zero-padded
     * id field, a title field, and nine extra fields (Item1..Item9) whose values are
     * dash-joined mixes of the id, random words from {@code words}, and a random number.
     *
     * @param xmlWriter destination stream writer (namespace prefix must already be set)
     * @param words     pool of words to sample field content from
     * @param id        numeric document id, rendered as {@code %08d}
     * @throws Exception if the underlying writer fails
     */
    public static void addRandomDocument (XMLStreamWriter xmlWriter, String [] words, int id) throws Exception {
        String paddedId = String.format ("%08d", id);
        xmlWriter.writeCharacters ("\n");
        xmlWriter.writeStartElement ("http://www.w3.org/TR/REC-html40", "doc");
        xmlWriter.writeCharacters ("\n");
        // Unique key field, e.g. "00000042".
        xmlWriter.writeStartElement ("http://www.w3.org/TR/REC-html40", "field");
        xmlWriter.writeAttribute ("name", "id");
        xmlWriter.writeCharacters (paddedId);
        xmlWriter.writeEndElement ();
        xmlWriter.writeCharacters ("\n");
        // Human-readable title derived from the id.
        xmlWriter.writeStartElement ("http://www.w3.org/TR/REC-html40", "field");
        xmlWriter.writeAttribute ("name", "title");
        xmlWriter.writeCharacters ("Document: " + paddedId);
        xmlWriter.writeEndElement ();
        xmlWriter.writeCharacters ("\n");
        String [] additionalItems = {"Item1", "Item2", "Item3",
                                     "Item4", "Item5", "Item6",
                                     "Item7", "Item8", "Item9"};
        for (String item : additionalItems) {
            xmlWriter.writeStartElement ("http://www.w3.org/TR/REC-html40", "field");
            xmlWriter.writeAttribute ("name", item);
            xmlWriter.writeCharacters (paddedId
                                       + "-"
                                       + words [m_randomGenerator.nextInt (words.length)]
                                       + "-"
                                       + String.format ("%08d", m_randomGenerator.nextInt (maxRandomInt))
                                       + "-"
                                       + words [m_randomGenerator.nextInt (words.length)]);
            xmlWriter.writeEndElement ();
            xmlWriter.writeCharacters ("\n");
        }
        xmlWriter.writeEndElement ();
        xmlWriter.writeCharacters ("\n");
    }

    /** Prints command-line usage. */
    private static void help () {
        System.out.println ("Usage: <xmlgenerator> <text file> <output xml> <number_of_documents>");
    }

    /**
     * Entry point: reads a word-source text file (args[0]), then writes args[2]
     * random documents wrapped in an {@code <add>} element to the XML file args[1].
     */
    public static void main (String [] args) throws IOException {
        if (args.length != 3) {
            help ();
            return;
        }
        String text;
        try {
            text = readFileAsString (args [0]);
        } catch (IOException e) {
            // Report the failure instead of silently swallowing it and then
            // printing a misleading "is empty" message.
            System.out.println ("Could not read " + args [0] + ": " + e);
            return;
        }
        String [] words = text.split ("[\\s]+");
        if (words.length < 100) {
            System.out.println (args [0] + " is not big enough");
            return;
        }
        int numberOfDocuments = Integer.parseInt (args [2]);
        XMLStreamWriter xmlWriter = null;
        try {
            // Write through an explicit UTF-8 encoder so the bytes on disk match
            // the "utf-8" encoding declared in the XML prolog (FileWriter would
            // use the platform default charset, e.g. windows-1252).
            xmlWriter = XMLOutputFactory.newInstance ().createXMLStreamWriter (
                new OutputStreamWriter (new FileOutputStream (args [1]), "UTF-8"));
            xmlWriter.writeStartDocument ("utf-8", "1.0");
            xmlWriter.writeCharacters ("\n");
            xmlWriter.setPrefix ("", "http://www.w3.org/TR/REC-html40");
            xmlWriter.writeStartElement ("http://www.w3.org/TR/REC-html40", "add");
            // Ids run from numberOfDocuments-1 down to 0.
            while (numberOfDocuments-- > 0) addRandomDocument (xmlWriter, words, numberOfDocuments);
            xmlWriter.writeEndElement ();
            xmlWriter.writeEndDocument ();
            xmlWriter.flush ();
        } catch (Exception e) {
            // Surface generation failures instead of discarding them.
            System.err.println ("Failed to generate " + args [1] + ": " + e);
        } finally {
            if (xmlWriter != null) try { xmlWriter.close (); } catch (Exception ignored) { }
        }
    }
}
Usage: ... XMLGenerator.jar XMLGenerator.java sample.xml 1000000
Schema used:
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!-- The optional sortMissingLast and sortMissingFirst attributes are
currently supported on types that are sorted internally as strings.
- If sortMissingLast="true", then a sort on this field will cause documents
without the field to come after documents with the field,
regardless of the requested sort order (asc or desc).
- If sortMissingFirst="true", then a sort on this field will cause documents
without the field to come before documents with the field,
regardless of the requested sort order.
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
then default lucene sorting will be used which places docs without the
field first in an ascending sort and last in a descending sort.
-->
<!-- numeric field types that store and index the text
value verbatim (and hence don't support range queries, since the
lexicographic ordering isn't equal to the numeric ordering) -->
<fieldType name="integer" class="solr.IntField" omitNorms="true"/>
<fieldType name="long" class="solr.LongField" omitNorms="true"/>
<fieldType name="float" class="solr.FloatField" omitNorms="true"/>
<fieldType name="double" class="solr.DoubleField" omitNorms="true"/>
<!-- Numeric field types that manipulate the value into
a string value that isn't human-readable in its internal form,
but with a lexicographic ordering the same as the numeric ordering,
so that range queries work correctly. -->
<fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime
The trailing "Z" designates UTC time and is mandatory.
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
All other components are mandatory.
Expressions can also be used to denote calculations that should be
performed relative to "NOW" to determine the value, ie...
NOW/HOUR
... Round to the start of the current hour
NOW-1DAY
... Exactly 1 day prior to now
NOW/DAY+6MONTHS+3DAYS
... 6 months and 3 days in the future from the start of
the current day
Consult the DateField javadocs for more information.
-->
<fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
<!-- The "RandomSortField" is not used to store or search any
data. You can declare fields of this type in your schema
to generate pseudo-random orderings of your docs for sorting
purposes. The ordering is generated based on the field name
and the version of the index, As long as the index version
remains unchanged, and the same field name is reused,
the ordering of the docs will be consistent.
If you want different pseudo-random orderings of documents,
for the same version of the index, use a dynamicField and
change the name
-->
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
The optional positionIncrementGap puts space between multiple fields of
this type on the same document, with the purpose of preventing false phrase
matching across fields.
For more info on customizing your analyzer chain, please see
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
-->
<!-- One can also specify an existing Analyzer class that has a
default constructor via the class attribute on the analyzer element
<fieldType name="text_greek" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
</fieldType>
-->
<!-- A text field that only splits on whitespace for exact matching of words -->
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
Duplicate tokens at the same position (which may result from Stemmed Synonyms or
WordDelim parts) are removed.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
enablePositionIncrements=true ensures that a 'gap' is left to
allow for accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!--
Setup simple analysis for spell checking
-->
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<analyzer>
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.KeywordTokenizerFactory"/>
<!-- The LowerCase TokenFilter does what you expect, which can be
when you want your sorting to be case insensitive
-->
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!-- The PatternReplaceFilter gives you the flexibility to use
Java Regular expression to replace any sequence of characters
matching a pattern with an arbitrary replacement string,
which may include back references to portions of the original
string matched by the pattern.
See the Java Regular Expression documentation for more
information on pattern and replacement string syntax.
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory"
pattern="([^a-z])" replacement="" replace="all"
/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright
-->
<fieldtype name="ignored" stored="false" indexed="false" class="solr.StrField" />
<uniqueKey>id</uniqueKey>
<defaultSearchField>id</defaultSearchField>
Hi,
Are you using the latest code? I ran your test and it matched 1M
oh tjake-Solandra-00b188b is a bit old... Can you try with the latest?
Thank you Jake for your quick response. I tried the latest Solandra build. It works with the xml generated. But I increased a number of items in documents and size of the items also. The bug can be reproduced again on the latest Solandra build.
All the code change were made in addRandomDocument () :
// Appends one <doc> element to the writer: an 8-digit zero-padded id field,
// a title field, and twenty additional fields (Item1..Item20) whose values are
// dash-joined mixes of the id, random words from the supplied pool, and
// zero-padded random integers.
public static void addRandomDocument (XMLStreamWriter xmlWriter, String [] words, int id) throws Exception {
    final String ns = "http://www.w3.org/TR/REC-html40";
    final String paddedId = String.format ("%08d", id);
    xmlWriter.writeCharacters ("\n");
    xmlWriter.writeStartElement (ns, "doc");
    xmlWriter.writeCharacters ("\n");
    // Unique key field, e.g. "00000042".
    xmlWriter.writeStartElement (ns, "field");
    xmlWriter.writeAttribute ("name", "id");
    xmlWriter.writeCharacters (paddedId);
    xmlWriter.writeEndElement ();
    xmlWriter.writeCharacters ("\n");
    // Human-readable title derived from the id.
    xmlWriter.writeStartElement (ns, "field");
    xmlWriter.writeAttribute ("name", "title");
    xmlWriter.writeCharacters ("Document: " + paddedId);
    xmlWriter.writeEndElement ();
    xmlWriter.writeCharacters ("\n");
    // Twenty extra fields named Item1..Item20. Each value is the padded id
    // followed by seven dash-separated parts that alternate between a random
    // word and a zero-padded random number (word, number, ..., ending on a word),
    // matching the original four-word/three-number layout.
    for (int n = 1; n <= 20; n++) {
        xmlWriter.writeStartElement (ns, "field");
        xmlWriter.writeAttribute ("name", "Item" + n);
        StringBuilder value = new StringBuilder (paddedId);
        for (int part = 0; part < 7; part++) {
            value.append ('-');
            if (part % 2 == 0)
                value.append (words [m_randomGenerator.nextInt (words.length)]);
            else
                value.append (String.format ("%08d", m_randomGenerator.nextInt (maxRandomInt)));
        }
        xmlWriter.writeCharacters (value.toString ());
        xmlWriter.writeEndElement ();
        xmlWriter.writeCharacters ("\n");
    }
    xmlWriter.writeEndElement ();
    xmlWriter.writeCharacters ("\n");
}
Also the schema was changed so as to index the new Items added. Actual result: numDocs value returned as a response for ../select/?q=: is less than million (about 960000) O/S Platform: Windows Server 2008.
Are there any errors in the log?
I don't see any errors in the log. All insert requests have status=0. This is the tail of the log containing ../select/?q=: : ..... INFO 10:36:44,291 {add=[00000199, 00000198, 00000197, 00000196, 00000195, 00000194, 00000193, 00000192, ... (200 adds)]} 0 59961 INFO 10:36:44,291 [solr_data] webapp=/solandra path=/update params={} status=0 QTime=59961 INFO 10:36:53,126 Completed flushing \tmp\cassandra-data\data\L\TI-g-259-Data.db (81349402 bytes) INFO 10:36:53,126 Discarding obsolete commit log:CommitLogSegment(C:\tmp\cassandra-data\commitlog\CommitLog-1308159375897.log) INFO 10:36:53,126 Discarding obsolete commit log:CommitLogSegment(C:\tmp\cassandra-data\commitlog\CommitLog-1308159386678.log) INFO 10:37:04,742 Compacted to \tmp\cassandra-data\data\L\TI-tmp-g-251-Data.db. 320,647,592 to 317,068,626 (~98% of original) bytes for 1,987,347 keys. Time: 163,491ms. INFO 10:37:04,773 Minor@631894615(L, TI, 172868922/329115236) now compacting at 16777 bytes/ms. INFO 10:37:50,982 Compacted to \tmp\cassandra-data\data\L\TI-tmp-g-256-Data.db. 329,115,236 to 324,791,074 (~98% of original) bytes for 2,038,944 keys. Time: 119,367ms. 
INFO 01:34:20,014 ShardInfo for solr_data has expired INFO 01:34:20,014 Found reserved shard10(147190272376517611437769280282474584895):131070 TO 131072 INFO 01:34:20,014 Found reserved shard11(147190272376517611437769280282474584895):131066 TO 131072 INFO 01:34:20,014 Found reserved shard12(147190272376517611437769280282474584895):119878 TO 131072 INFO 01:34:20,014 Found reserved shard13(147190272376517611437769280282474584895):37957 TO 49152 INFO 01:34:20,014 Found reserved shard14(147190272376517611437769280282474584895):127803 TO 131072 INFO 01:34:20,014 Found reserved shard16(147190272376517611437769280282474584895):103787 TO 114688 INFO 01:34:20,014 Found reserved shard5(147190272376517611437769280282474584895):119860 TO 131072 INFO 01:34:20,014 Found reserved shard6(147190272376517611437769280282474584895):131060 TO 131072 INFO 01:34:20,014 Found reserved shard7(147190272376517611437769280282474584895):119886 TO 131072 INFO 01:34:20,014 Found reserved shard9(147190272376517611437769280282474584895):119413 TO 131072 INFO 01:34:20,014 solr_data has 20 shards INFO 01:34:21,028 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=5481 status=0 QTime=312 INFO 01:34:21,855 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=29496 status=0 QTime=1139 INFO 01:34:21,995 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=29859 status=0 QTime=1279 INFO 01:34:22,135 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=32765 status=0 QTime=1419 INFO 01:34:22,525 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=32768 status=0 QTime=1809 INFO 01:34:22,837 [solr_data] webapp=/solandra 
path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=55281 status=0 QTime=2121 INFO 01:34:23,836 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=27733 status=0 QTime=3120 INFO 01:34:24,117 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=32768 status=0 QTime=3401 INFO 01:34:24,117 GC for ParNew: 213 ms, 1114756088 reclaimed leaving 18086261952 used; max is 34328281088 INFO 01:34:24,491 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=65104 status=0 QTime=3775 INFO 01:34:24,600 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=71096 status=0 QTime=3869 INFO 01:34:24,647 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=46672 status=0 QTime=3931 INFO 01:34:24,834 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=65121 status=0 QTime=4118 INFO 01:34:25,318 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=65114 status=0 QTime=4602 INFO 01:34:25,318 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=42597 status=0 QTime=4587 INFO 01:34:25,318 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=32767 status=0 QTime=4602 INFO 01:34:25,412 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=65111 status=0 QTime=4681 INFO 
01:34:25,505 [solr_data] webapp=/solandra path=/select params={fl=id,score&start=0&q=:&isShard=true&wt=javabin&fsv=true&rows=10&version=2} hits=58707 status=0 QTime=4789 INFO 01:34:26,348 [solr_data] webapp=/solandra path=/select params={isShard=true&wt=javabin&q=:&ids=00836829,00823158,00817590,00844223,00843426,00823962,00818188,00823961,00834699,00836830&version=2} status=0 QTime=765 INFO 01:34:26,348 [solr_data] webapp=/solandra path=/select/ params={q=:} status=0 QTime=6506
The part of corresponding http response is:
... name="responseHeader"
... name="status">0<
... name="QTime">6506<
... name="response" numFound="758440" ..
"
#NOTE This value should not be changed once documents are indexed #NOTE This value must be a power of 2 solandra.maximum.docs.per.shard = 131072
#The number of index ids to reserve at a time #NOTE this value must be a power of 2 solandra.index.id.reserve.size = 16384
(I did all the changes on a fresh Solandra setup before inserting all the documents.) Changing this setting affects the numFound value.
It could be 20 shards is too much for windows? I updated the defaults in solandra.properties to be ~1M per shard. could you try the latest?
1M per shard is not enough. It started working with 4M docs per shard. Could you implement a general solution for the issue, e.g. provide instructions on how to estimate solandra.maximum.docs.per.shard and/or implement an HTTP error response if the number of documents returned is wrong.
I believe this issue is fixed now can you re-test?
Ok, I'll test on the latest release.
I tested the tjake-Solandra-3af3b0b build. It failed. Having inserted 2000000 documents I executed q=: query. And the response was "result name="response" numFound="1581806" start="0"".