public class CollectionConfiguration extends SchemaConfiguration implements java.io.Serializable
Modifier and Type | Class and Description |
---|---|
private static class |
CollectionConfiguration.CRHost
The CRHost class is a container for all ranking values of a specific host.
|
private static class |
CollectionConfiguration.CRV |
static class |
CollectionConfiguration.FailDoc
encode a string containing attributes from anchor rel properties binary:
bit 0: "me" contained in rel
bit 1: "nofollow" contained in rel
|
static class |
CollectionConfiguration.SolrVector
a SolrVector is a SolrInputDocument with the ability
to store also the webgraph that is associated with
the web document in the Solr document.
|
static class |
CollectionConfiguration.Subgraph |
Configuration.Entry
Modifier and Type | Field and Description |
---|---|
private static java.util.Set<java.lang.String> |
omitFields |
static java.lang.String |
postprocessingActivity |
static int |
postprocessingCollection1Count |
static boolean |
postprocessingRunning |
static long |
postprocessingStartTime |
static int |
postprocessingWebgraphCount |
private java.util.ArrayList<Ranking> |
rankings |
private static long |
serialVersionUID |
static boolean |
UNIQUE_HEURISTIC_PREFER_HTTPS |
static boolean |
UNIQUE_HEURISTIC_PREFER_WWWPREFIX |
lazy
Constructor and Description |
---|
CollectionConfiguration(java.io.File configurationFile,
boolean lazy)
Initialize the schema with a given configuration file.
The configuration file simply contains a list of lines with keywords
or keyword = value lines (where value is a custom Solr field name).
|
Modifier and Type | Method and Description |
---|---|
private static void |
accText(java.lang.StringBuilder sb,
java.lang.String text) |
java.lang.String |
addURIAttributes(SolrInputDocument doc,
boolean allAttr,
DigestURL digestURL)
add uri attributes to solr document
|
java.lang.String[] |
allFields() |
static java.lang.String |
collection1query(Segment segment,
java.lang.String harvestkey) |
void |
commit()
save configuration to file and update enum SolrFields
|
void |
enrich(SolrInputDocument doc,
java.util.List<java.lang.String> synonyms,
java.util.Map<java.lang.String,java.util.Set<java.lang.String>> genericFacets)
attach additional information to the document to enable navigation features
|
static boolean |
enrichSubgraph(CollectionConfiguration.Subgraph subgraph,
DigestURL source_url,
AnchorURL target_url) |
Ranking |
getRanking(int idx) |
Ranking |
getRanking(java.lang.String name) |
static java.util.List<java.lang.String> |
indexedList2protocolList(java.util.Collection<java.lang.Object> iplist,
int dimension) |
SolrInputDocument |
metadata2solr(URIMetadataNode md) |
void |
postprocessing_doublecontent(Segment segment,
java.util.Set<java.lang.String> uniqueURLs,
SolrDocument doc,
SolrInputDocument sid,
DigestURL url) |
void |
postprocessing_http_unique(Segment segment,
SolrDocument doc,
SolrInputDocument sid,
DigestURL url) |
boolean |
postprocessing_references(Segment.ReferenceReportCache rrCache,
SolrInputDocument sid,
DigestURL url,
java.util.Map<java.lang.String,java.lang.Long> hostExtentCount) |
void |
postprocessing_www_unique(Segment segment,
SolrDocument doc,
SolrInputDocument sid,
DigestURL url) |
int |
postprocessing(Segment segment,
Segment.ReferenceReportCache rrCache,
java.lang.String harvestkey,
boolean byPartialUpdate)
post-processing steps for all entries that have a process tag assigned
|
static java.util.List<java.lang.String> |
protocolList2indexedList(java.util.List<java.lang.String> protocol)
this method compresses a list of protocol names to an indexed list.
|
private void |
set_unique_flag(CollectionSchema field,
SolrDocument doc,
SolrInputDocument sid,
SolrDocument d) |
SolrDocument |
toSolrDocument(SolrInputDocument doc) |
SolrInputDocument |
toSolrInputDocument(SolrDocument doc) |
static java.lang.String |
webgraphquery(Segment segment,
java.lang.String harvestkey) |
CollectionConfiguration.SolrVector |
yacy2solr(Segment segment,
java.util.Map<java.lang.String,java.util.regex.Pattern> collections,
ResponseHeader responseHeader,
Document document,
Condenser condenser,
DigestURL referrerURL,
java.lang.String language,
boolean setUnique,
WebgraphConfiguration webgraph,
java.lang.String sourceName) |
add, add, add, add, add, add, add, add, add, contains, fill, getDate, remove, remove, toSolrDocument, toSolrInputDocument
add, add, add, contains, containsDisabled, entryIterator, main
ceilingEntry, ceilingKey, clear, clone, comparator, containsKey, containsValue, descendingKeySet, descendingMap, entrySet, firstEntry, firstKey, floorEntry, floorKey, get, headMap, headMap, higherEntry, higherKey, keySet, lastEntry, lastKey, lowerEntry, lowerKey, navigableKeySet, pollFirstEntry, pollLastEntry, put, putAll, remove, size, subMap, subMap, tailMap, tailMap, values
private static final long serialVersionUID
public static boolean UNIQUE_HEURISTIC_PREFER_HTTPS
public static boolean UNIQUE_HEURISTIC_PREFER_WWWPREFIX
private final java.util.ArrayList<Ranking> rankings
private static final java.util.Set<java.lang.String> omitFields
public static boolean postprocessingRunning
public static java.lang.String postprocessingActivity
public static long postprocessingStartTime
public static int postprocessingCollection1Count
public static int postprocessingWebgraphCount
public CollectionConfiguration(java.io.File configurationFile, boolean lazy) throws java.io.IOException
configurationFile
- java.io.IOException
public java.lang.String[] allFields()
public Ranking getRanking(int idx)
public Ranking getRanking(java.lang.String name)
name
- The name of the ranking to get.
public void commit() throws java.io.IOException
commit
in class Configuration
java.io.IOException
public SolrInputDocument toSolrInputDocument(SolrDocument doc)
toSolrInputDocument
in class SchemaConfiguration
public SolrDocument toSolrDocument(SolrInputDocument doc)
public java.lang.String addURIAttributes(SolrInputDocument doc, boolean allAttr, DigestURL digestURL)
doc
- allAttr
- digestURL
- doctype
-
public SolrInputDocument metadata2solr(URIMetadataNode md)
private static void accText(java.lang.StringBuilder sb, java.lang.String text)
public static boolean enrichSubgraph(CollectionConfiguration.Subgraph subgraph, DigestURL source_url, AnchorURL target_url)
public CollectionConfiguration.SolrVector yacy2solr(Segment segment, java.util.Map<java.lang.String,java.util.regex.Pattern> collections, ResponseHeader responseHeader, Document document, Condenser condenser, DigestURL referrerURL, java.lang.String language, boolean setUnique, WebgraphConfiguration webgraph, java.lang.String sourceName)
public void enrich(SolrInputDocument doc, java.util.List<java.lang.String> synonyms, java.util.Map<java.lang.String,java.util.Set<java.lang.String>> genericFacets)
doc
- the document to be enriched
synonyms
- a list of synonyms detected for the text content
genericFacets
- a map where the key is the navigator name and the value is the set of attribute names
public static final java.lang.String collection1query(Segment segment, java.lang.String harvestkey)
public static final java.lang.String webgraphquery(Segment segment, java.lang.String harvestkey)
public int postprocessing(Segment segment, Segment.ReferenceReportCache rrCache, java.lang.String harvestkey, boolean byPartialUpdate)
connector
- urlCitation
-
public void postprocessing_http_unique(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURL url)
public void postprocessing_www_unique(Segment segment, SolrDocument doc, SolrInputDocument sid, DigestURL url)
private void set_unique_flag(CollectionSchema field, SolrDocument doc, SolrInputDocument sid, SolrDocument d)
public void postprocessing_doublecontent(Segment segment, java.util.Set<java.lang.String> uniqueURLs, SolrDocument doc, SolrInputDocument sid, DigestURL url)
public boolean postprocessing_references(Segment.ReferenceReportCache rrCache, SolrInputDocument sid, DigestURL url, java.util.Map<java.lang.String,java.lang.Long> hostExtentCount)
public static java.util.List<java.lang.String> protocolList2indexedList(java.util.List<java.lang.String> protocol)
A list of `index-protocol` entries is produced, where `index` is an index pointing to the original index of the protocol entry and
`protocol` is the protocol entry itself. The `index` entry is formatted as a 3-digit decimal number with leading zero digits.
protocol
-
public static java.util.List<java.lang.String> indexedList2protocolList(java.util.Collection<java.lang.Object> iplist, int dimension)