001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.lucene.demo; 018 019import java.io.BufferedReader; 020import java.io.IOException; 021import java.io.InputStream; 022import java.io.InputStreamReader; 023import java.nio.charset.StandardCharsets; 024import java.nio.file.FileVisitResult; 025import java.nio.file.Files; 026import java.nio.file.Path; 027import java.nio.file.Paths; 028import java.nio.file.SimpleFileVisitor; 029import java.nio.file.attribute.BasicFileAttributes; 030import java.util.Date; 031import java.util.Objects; 032import org.apache.lucene.analysis.Analyzer; 033import org.apache.lucene.analysis.standard.StandardAnalyzer; 034import org.apache.lucene.demo.knn.DemoEmbeddings; 035import org.apache.lucene.demo.knn.KnnVectorDict; 036import org.apache.lucene.document.Document; 037import org.apache.lucene.document.Field; 038import org.apache.lucene.document.KeywordField; 039import org.apache.lucene.document.KnnFloatVectorField; 040import org.apache.lucene.document.LongField; 041import org.apache.lucene.document.TextField; 042import org.apache.lucene.index.DirectoryReader; 043import org.apache.lucene.index.IndexReader; 044import org.apache.lucene.index.IndexWriter; 045import org.apache.lucene.index.IndexWriterConfig; 046import org.apache.lucene.index.IndexWriterConfig.OpenMode; 047import org.apache.lucene.index.Term; 048import org.apache.lucene.index.VectorSimilarityFunction; 049import org.apache.lucene.store.Directory; 050import org.apache.lucene.store.FSDirectory; 051import org.apache.lucene.util.IOUtils; 052 053/** 054 * Index all text files under a directory. 055 * 056 * <p>This is a command-line application demonstrating simple Lucene indexing. Run it with no 057 * command-line arguments for usage information. 058 */ 059public class IndexFiles implements AutoCloseable { 060 static final String KNN_DICT = "knn-dict"; 061 062 // Calculates embedding vectors for KnnVector search 063 private final DemoEmbeddings demoEmbeddings; 064 private final KnnVectorDict vectorDict; 065 066 private IndexFiles(KnnVectorDict vectorDict) throws IOException { 067 if (vectorDict != null) { 068 this.vectorDict = vectorDict; 069 demoEmbeddings = new DemoEmbeddings(vectorDict); 070 } else { 071 this.vectorDict = null; 072 demoEmbeddings = null; 073 } 074 } 075 076 /** Index all text files under a directory. */ 077 public static void main(String[] args) throws Exception { 078 String usage = 079 "java org.apache.lucene.demo.IndexFiles" 080 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update] [-knn_dict DICT_PATH]\n\n" 081 + "This indexes the documents in DOCS_PATH, creating a Lucene index " 082 + "in INDEX_PATH that can be searched with SearchFiles\n" 083 + "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search"; 084 String indexPath = "index"; 085 String docsPath = null; 086 String vectorDictSource = null; 087 boolean create = true; 088 for (int i = 0; i < args.length; i++) { 089 switch (args[i]) { 090 case "-index": 091 indexPath = args[++i]; 092 break; 093 case "-docs": 094 docsPath = args[++i]; 095 break; 096 case "-knn_dict": 097 vectorDictSource = args[++i]; 098 break; 099 case "-update": 100 create = false; 101 break; 102 case "-create": 103 create = true; 104 break; 105 default: 106 throw new IllegalArgumentException("unknown parameter " + args[i]); 107 } 108 } 109 110 if (docsPath == null) { 111 System.err.println("Usage: " + usage); 112 System.exit(1); 113 } 114 115 final Path docDir = Paths.get(docsPath); 116 if (!Files.isReadable(docDir)) { 117 System.out.println( 118 "Document directory '" 119 + docDir.toAbsolutePath() 120 + "' does not exist or is not readable, please check the path"); 121 System.exit(1); 122 } 123 124 Date start = new Date(); 125 try { 126 System.out.println("Indexing to directory '" + indexPath + "'..."); 127 128 Directory dir = FSDirectory.open(Paths.get(indexPath)); 129 Analyzer analyzer = new StandardAnalyzer(); 130 IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 131 132 if (create) { 133 // Create a new index in the directory, removing any 134 // previously indexed documents: 135 iwc.setOpenMode(OpenMode.CREATE); 136 } else { 137 // Add new documents to an existing index: 138 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); 139 } 140 141 // Optional: for better indexing performance, if you 142 // are indexing many documents, increase the RAM 143 // buffer. But if you do this, increase the max heap 144 // size to the JVM (e.g. add -Xmx512m or -Xmx1g): 145 // 146 // iwc.setRAMBufferSizeMB(256.0); 147 148 KnnVectorDict vectorDictInstance = null; 149 long vectorDictSize = 0; 150 if (vectorDictSource != null) { 151 KnnVectorDict.build(Paths.get(vectorDictSource), dir, KNN_DICT); 152 vectorDictInstance = new KnnVectorDict(dir, KNN_DICT); 153 vectorDictSize = vectorDictInstance.ramBytesUsed(); 154 } 155 156 try (IndexWriter writer = new IndexWriter(dir, iwc); 157 IndexFiles indexFiles = new IndexFiles(vectorDictInstance)) { 158 indexFiles.indexDocs(writer, docDir); 159 160 // NOTE: if you want to maximize search performance, 161 // you can optionally call forceMerge here. This can be 162 // a terribly costly operation, so generally it's only 163 // worth it when your index is relatively static (ie 164 // you're done adding documents to it): 165 // 166 // writer.forceMerge(1); 167 } finally { 168 IOUtils.close(vectorDictInstance); 169 } 170 171 Date end = new Date(); 172 try (IndexReader reader = DirectoryReader.open(dir)) { 173 System.out.println( 174 "Indexed " 175 + reader.numDocs() 176 + " documents in " 177 + (end.getTime() - start.getTime()) 178 + " ms"); 179 if (Objects.isNull(vectorDictSource) == false 180 && reader.numDocs() > 100 181 && vectorDictSize < 1_000_000 182 && System.getProperty("smoketester") == null) { 183 throw new RuntimeException( 184 "Are you (ab)using the toy vector dictionary? See the package javadocs to understand why you got this exception."); 185 } 186 } 187 } catch (IOException e) { 188 System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); 189 } 190 } 191 192 /** 193 * Indexes the given file using the given writer, or if a directory is given, recurses over files 194 * and directories found under the given directory. 195 * 196 * <p>NOTE: This method indexes one document per input file. This is slow. For good throughput, 197 * put multiple documents into your input file(s). An example of this is in the benchmark module, 198 * which can create "line doc" files, one document per line, using the <a 199 * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" 200 * >WriteLineDocTask</a>. 201 * 202 * @param writer Writer to the index where the given file/dir info will be stored 203 * @param path The file to index, or the directory to recurse into to find files to index 204 * @throws IOException If there is a low-level I/O error 205 */ 206 void indexDocs(final IndexWriter writer, Path path) throws IOException { 207 if (Files.isDirectory(path)) { 208 Files.walkFileTree( 209 path, 210 new SimpleFileVisitor<>() { 211 @Override 212 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) { 213 try { 214 indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); 215 } catch ( 216 @SuppressWarnings("unused") 217 IOException ignore) { 218 ignore.printStackTrace(System.err); 219 // don't index files that can't be read. 220 } 221 return FileVisitResult.CONTINUE; 222 } 223 }); 224 } else { 225 indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); 226 } 227 } 228 229 /** Indexes a single document */ 230 void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { 231 try (InputStream stream = Files.newInputStream(file)) { 232 // make a new, empty document 233 Document doc = new Document(); 234 235 // Add the path of the file as a field named "path". Use a 236 // field that is indexed (i.e. searchable), but don't tokenize 237 // the field into separate words and don't index term frequency 238 // or positional information: 239 doc.add(new KeywordField("path", file.toString(), Field.Store.YES)); 240 241 // Add the last modified date of the file a field named "modified". 242 // Use a LongField that is indexed with points and doc values, and is efficient 243 // for both filtering (LongField#newRangeQuery) and sorting 244 // (LongField#newSortField). This indexes to millisecond resolution, which 245 // is often too fine. You could instead create a number based on 246 // year/month/day/hour/minutes/seconds, down the resolution you require. 247 // For example the long value 2011021714 would mean 248 // February 17, 2011, 2-3 PM. 249 doc.add(new LongField("modified", lastModified, Field.Store.NO)); 250 251 // Add the contents of the file to a field named "contents". Specify a Reader, 252 // so that the text of the file is tokenized and indexed, but not stored. 253 // Note that FileReader expects the file to be in UTF-8 encoding. 254 // If that's not the case searching for special characters will fail. 255 doc.add( 256 new TextField( 257 "contents", 258 new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); 259 260 if (demoEmbeddings != null) { 261 try (InputStream in = Files.newInputStream(file)) { 262 float[] vector = 263 demoEmbeddings.computeEmbedding( 264 new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))); 265 doc.add( 266 new KnnFloatVectorField( 267 "contents-vector", vector, VectorSimilarityFunction.DOT_PRODUCT)); 268 } 269 } 270 271 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 272 // New index, so we just add the document (no old document can be there): 273 System.out.println("adding " + file); 274 writer.addDocument(doc); 275 } else { 276 // Existing index (an old copy of this document may have been indexed) so 277 // we use updateDocument instead to replace the old one matching the exact 278 // path, if present: 279 System.out.println("updating " + file); 280 writer.updateDocument(new Term("path", file.toString()), doc); 281 } 282 } 283 } 284 285 @Override 286 public void close() throws IOException { 287 IOUtils.close(vectorDict); 288 } 289}