public class Segment
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
class |
Segment.ReferenceReport
A ReferenceReport object is a container for all references to a specific url.
|
class |
Segment.ReferenceReportCache |
Modifier and Type | Field and Description |
---|---|
static byte[] |
catchallHash |
static java.lang.String |
catchallString |
(package private) static Word |
catchallWord |
static java.lang.String |
citationIndexName |
static ReferenceFactory<CitationReference> |
citationReferenceFactory |
protected IndexTable |
firstSeenIndex |
static java.lang.String |
firstseenIndexName |
protected Fulltext |
fulltext |
private ConcurrentLog |
log |
static int |
lowcachedivisor |
protected IODispatcher |
merger |
private java.io.File |
segmentPath |
static long |
targetFileSize |
protected IndexCell<WordReference> |
termIndex |
static java.lang.String |
termIndexName |
protected IndexCell<CitationReference> |
urlCitationIndex |
static long |
wCacheMaxAge |
static int |
wCacheMaxChunk |
static ByteOrder |
wordOrder |
static ReferenceFactory<WordReference> |
wordReferenceFactory |
static int |
writeBufferSize |
Constructor and Description |
---|
Segment(ConcurrentLog log,
java.io.File segmentPath,
java.io.File archivePath,
CollectionConfiguration collectionConfiguration,
WebgraphConfiguration webgraphConfiguration)
create a new Segment
|
Modifier and Type | Method and Description |
---|---|
int |
citationCount() |
long |
citationSegmentCount() |
void |
clear() |
void |
clearCaches() |
void |
close() |
void |
connectCitation(int entityCacheMaxSize,
long maxFileSize) |
boolean |
connectedCitation() |
boolean |
connectedRWI() |
void |
connectRWI(int entityCacheMaxSize,
long maxFileSize) |
void |
disconnectCitation() |
void |
disconnectRWI() |
IndexTable |
firstSeen() |
Fulltext |
fulltext() |
long |
getFirstSeenTime(byte[] urlhash) |
long |
getLoadTime(java.lang.String urlhash)
get the load time of a resource.
|
java.io.File |
getLocation() |
Segment.ReferenceReportCache |
getReferenceReportCache() |
int |
getWordCountGuess(java.lang.String word)
get a guess about the word count.
|
void |
putDocument(SolrInputDocument queueEntry)
putDocument should not be called directly; instead, put queueEntries to
indexingPutDocumentProcessor
(this must be public; otherwise the WorkflowProcessor, which calls it by reflection, does not work)
ATTENTION: do not remove! Profiling tools will show that this is not called, which is not true (it is invoked via reflection)
|
int |
removeAllUrlReferences(byte[] urlhash,
LoaderDispatcher loader,
ClientIdentification.Agent agent,
CacheStrategy cacheStrategy)
find all the words in a specific resource and remove the url reference from every word index
finally, delete the url entry
|
void |
removeAllUrlReferences(HandleSet urls,
LoaderDispatcher loader,
ClientIdentification.Agent agent,
CacheStrategy cacheStrategy) |
int |
RWIBufferCount() |
long |
RWICount() |
long |
RWISegmentCount() |
void |
setFirstSeenTime(byte[] urlhash,
long time) |
SolrInputDocument |
storeDocument(DigestURL url,
DigestURL referrerURL,
java.util.Map<java.lang.String,java.util.regex.Pattern> collections,
CrawlProfile crawlProfile,
ResponseHeader responseHeader,
Document document,
Condenser condenser,
SearchEvent searchEvent,
java.lang.String sourceName,
boolean storeToRWI,
java.lang.String proxy,
java.lang.String acceptLanguage) |
void |
storeRWI(byte[] termHash,
WordReference entry) |
void |
storeRWI(ReferenceContainer<WordReference> wordContainer) |
IndexCell<WordReference> |
termIndex() |
IndexCell<CitationReference> |
urlCitation() |
java.util.Iterator<DigestURL> |
urlSelector(MultiProtocolURL stub,
long maxtime,
int maxcount)
discover all urls that start with a given url stub
|
private static java.lang.String |
votedLanguage(DigestURL url,
java.lang.String urlNormalform,
Document document,
Condenser condenser) |
public static final java.lang.String catchallString
public static final byte[] catchallHash
static final Word catchallWord
public static final long wCacheMaxAge
public static final int wCacheMaxChunk
public static final int lowcachedivisor
public static final long targetFileSize
public static final int writeBufferSize
public static final java.lang.String termIndexName
public static final java.lang.String citationIndexName
public static final java.lang.String firstseenIndexName
public static final ReferenceFactory<WordReference> wordReferenceFactory
public static final ReferenceFactory<CitationReference> citationReferenceFactory
public static final ByteOrder wordOrder
private final ConcurrentLog log
private final java.io.File segmentPath
protected final Fulltext fulltext
protected IndexCell<WordReference> termIndex
protected IndexCell<CitationReference> urlCitationIndex
protected IndexTable firstSeenIndex
protected IODispatcher merger
public Segment(ConcurrentLog log, java.io.File segmentPath, java.io.File archivePath, CollectionConfiguration collectionConfiguration, WebgraphConfiguration webgraphConfiguration) throws java.io.IOException
log -
segmentPath - that should be the path pointing to the directory "SEGMENT"
collectionSchema -
java.io.IOException
public boolean connectedRWI()
public void connectRWI(int entityCacheMaxSize, long maxFileSize) throws java.io.IOException
java.io.IOException
public void disconnectRWI()
public boolean connectedCitation()
public void connectCitation(int entityCacheMaxSize, long maxFileSize) throws java.io.IOException
java.io.IOException
public void disconnectCitation()
public int citationCount()
public long citationSegmentCount()
public Fulltext fulltext()
public IndexCell<WordReference> termIndex()
public IndexCell<CitationReference> urlCitation()
public IndexTable firstSeen()
public Segment.ReferenceReportCache getReferenceReportCache()
public long RWICount()
public long RWISegmentCount()
public int RWIBufferCount()
public int getWordCountGuess(java.lang.String word)
word -
public void setFirstSeenTime(byte[] urlhash, long time)
public long getFirstSeenTime(byte[] urlhash)
public long getLoadTime(java.lang.String urlhash) throws java.io.IOException
urlHash -
java.io.IOException
public java.util.Iterator<DigestURL> urlSelector(MultiProtocolURL stub, long maxtime, int maxcount)
stub -
public void clear()
public void clearCaches()
public java.io.File getLocation()
public void close()
private static java.lang.String votedLanguage(DigestURL url, java.lang.String urlNormalform, Document document, Condenser condenser)
public void storeRWI(ReferenceContainer<WordReference> wordContainer) throws java.io.IOException, SpaceExceededException
java.io.IOException
SpaceExceededException
public void storeRWI(byte[] termHash, WordReference entry) throws java.io.IOException, SpaceExceededException
java.io.IOException
SpaceExceededException
public void putDocument(SolrInputDocument queueEntry)
queueEntry -
java.io.IOException
public SolrInputDocument storeDocument(DigestURL url, DigestURL referrerURL, java.util.Map<java.lang.String,java.util.regex.Pattern> collections, CrawlProfile crawlProfile, ResponseHeader responseHeader, Document document, Condenser condenser, SearchEvent searchEvent, java.lang.String sourceName, boolean storeToRWI, java.lang.String proxy, java.lang.String acceptLanguage)
public void removeAllUrlReferences(HandleSet urls, LoaderDispatcher loader, ClientIdentification.Agent agent, CacheStrategy cacheStrategy)
public int removeAllUrlReferences(byte[] urlhash, LoaderDispatcher loader, ClientIdentification.Agent agent, CacheStrategy cacheStrategy)
urlhash - the hash of the url that shall be removed
loader -
cacheStrategy -