public class CrawlProfile
extends java.util.concurrent.ConcurrentHashMap<java.lang.String,java.lang.String>
implements java.util.Map<java.lang.String,java.lang.String>
Modifier and Type | Class and Description |
---|---|
static class |
CrawlProfile.CrawlAttribute |
Modifier and Type | Field and Description |
---|---|
private java.util.Map<java.lang.String,java.util.regex.Pattern> |
cmap |
static java.lang.String |
CRAWL_PROFILE_PUSH_STUB |
private java.util.regex.Pattern |
crawleripmustmatch |
private java.util.regex.Pattern |
crawleripmustnotmatch |
private java.util.regex.Pattern |
crawlernodepthlimitmatch |
private java.util.regex.Pattern |
crawlerurlmustmatch |
private java.util.regex.Pattern |
crawlerurlmustnotmatch |
private java.util.Map<java.lang.String,java.util.concurrent.atomic.AtomicInteger> |
doms |
private java.util.regex.Pattern |
indexcontentmustmatch |
private java.util.regex.Pattern |
indexcontentmustnotmatch |
private java.util.regex.Pattern |
indexurlmustmatch |
private java.util.regex.Pattern |
indexurlmustnotmatch |
static java.util.regex.Pattern |
MATCH_ALL_PATTERN |
static java.lang.String |
MATCH_ALL_STRING |
static java.util.regex.Pattern |
MATCH_NEVER_PATTERN |
static java.lang.String |
MATCH_NEVER_STRING |
private VocabularyScraper |
scraper |
private static long |
serialVersionUID |
private java.util.regex.Pattern |
snapshotsMustnotmatch |
Constructor and Description |
---|
CrawlProfile(java.util.Map<java.lang.String,java.lang.String> ext)
Constructor which creates a CrawlProfile from values in a Map.
|
CrawlProfile(java.lang.String name,
java.lang.String crawlerUrlMustMatch,
java.lang.String crawlerUrlMustNotMatch,
java.lang.String crawlerIpMustMatch,
java.lang.String crawlerIpMustNotMatch,
java.lang.String crawlerCountryMustMatch,
java.lang.String crawlerNoDepthLimitMatch,
java.lang.String indexUrlMustMatch,
java.lang.String indexUrlMustNotMatch,
java.lang.String indexContentMustMatch,
java.lang.String indexContentMustNotMatch,
int depth,
boolean directDocByURL,
java.util.Date recrawlIfOlder,
int domMaxPages,
boolean crawlingQ,
boolean followFrames,
boolean obeyHtmlRobotsNoindex,
boolean obeyHtmlRobotsNofollow,
boolean indexText,
boolean indexMedia,
boolean storeHTCache,
boolean remoteIndexing,
int snapshotsMaxDepth,
boolean snapshotsLoadImage,
boolean snapshotsReplaceOld,
java.lang.String snapshotsMustnotmatch,
CacheStrategy cacheStrategy,
java.lang.String collections,
java.lang.String userAgentName,
VocabularyScraper scraper,
int timezoneOffset)
Constructor which creates a CrawlProfile from parameters.
|
Modifier and Type | Method and Description |
---|---|
CacheStrategy |
cacheStrategy() |
java.lang.String |
collectionName()
create a name that takes the collection as name if this is not "user".
|
static java.util.Map<java.lang.String,java.util.regex.Pattern> |
collectionParser(java.lang.String collectionString) |
java.util.Map<java.lang.String,java.util.regex.Pattern> |
collections()
get the collections for this crawl
|
java.lang.String[] |
countryMustMatchList()
get the list of countries that must match for the locations of the URLs IPs
|
java.util.regex.Pattern |
crawlerNoDepthLimitMatchPattern()
If the regex matches with the url, then there is no depth limit on the crawl (it overrides depth == 0)
|
boolean |
crawlingQ() |
int |
depth()
Gets depth of crawl job (or height of the tree which will be
created by the crawler).
|
boolean |
directDocByURL() |
void |
domInc(java.lang.String domain) |
int |
domMaxPages() |
private java.lang.String |
domName(boolean attr,
int index) |
boolean |
followFrames() |
ClientIdentification.Agent |
getAgent() |
java.util.concurrent.atomic.AtomicInteger |
getCount(java.lang.String domain) |
static java.util.Date |
getRecrawlDate(long oldTimeMinutes)
get a recrawl date for a given age in minutes
|
java.lang.String |
handle()
Gets handle of the CrawlProfile.
|
java.util.regex.Pattern |
indexContentMustMatchPattern()
Gets the regex which must be matched by content in order to be indexed.
|
java.util.regex.Pattern |
indexContentMustNotMatchPattern()
Gets the regex which must not be matched by content in order to be indexed.
|
boolean |
indexMedia() |
boolean |
indexText() |
java.util.regex.Pattern |
indexUrlMustMatchPattern()
Gets the regex which must be matched by URLs in order to be indexed.
|
java.util.regex.Pattern |
indexUrlMustNotMatchPattern()
Gets the regex which must not be matched by URLs in order to be indexed.
|
java.util.regex.Pattern |
ipMustMatchPattern()
Gets the regex which must be matched by IPs in order to be crawled.
|
java.util.regex.Pattern |
ipMustNotMatchPattern()
Gets the regex which must not be matched by IPs in order to be crawled.
|
boolean |
isPushCrawlProfile() |
static java.lang.String |
mustMatchFilterFullDomain(MultiProtocolURL url) |
static java.lang.String |
mustMatchSubpath(MultiProtocolURL url) |
java.lang.String |
name()
Gets the name of the CrawlProfile.
|
boolean |
obeyHtmlRobotsNofollow() |
boolean |
obeyHtmlRobotsNoindex() |
void |
put(java.lang.String key,
boolean value)
Adds a parameter to CrawlProfile.
|
private void |
put(java.lang.String key,
int value)
Adds a parameter to CrawlProfile.
|
private void |
put(java.lang.String key,
long value)
Adds a parameter to CrawlProfile.
|
void |
putProfileEntry(java.lang.String CRAWL_PROFILE_PREFIX,
serverObjects prop,
boolean active,
boolean dark,
int count,
int domlistlength) |
long |
recrawlIfOlder()
Gets the minimum date that an entry must have to be re-crawled.
|
boolean |
remoteIndexing() |
VocabularyScraper |
scraper() |
void |
setCacheStrategy(CacheStrategy newStrategy) |
static java.lang.String |
siteFilter(java.util.Collection<? extends MultiProtocolURL> urls) |
boolean |
snapshotLoadImage() |
int |
snapshotMaxdepth() |
boolean |
snapshotReplaceold() |
java.util.regex.Pattern |
snapshotsMustnotmatch() |
boolean |
storeHTCache() |
static java.lang.String |
subpathFilter(java.util.Collection<? extends MultiProtocolURL> urls) |
int |
timezoneOffset() |
java.util.regex.Pattern |
urlMustMatchPattern()
Gets the regex which must be matched by URLs in order to be crawled.
|
java.util.regex.Pattern |
urlMustNotMatchPattern()
Gets the regex which must not be matched by URLs in order to be crawled.
|
clear, contains, containsKey, containsValue, elements, entrySet, get, isEmpty, keys, keySet, put, putAll, putIfAbsent, remove, remove, replace, replace, size, values
private static final long serialVersionUID
public static final java.lang.String MATCH_ALL_STRING
public static final java.lang.String MATCH_NEVER_STRING
public static final java.util.regex.Pattern MATCH_ALL_PATTERN
public static final java.util.regex.Pattern MATCH_NEVER_PATTERN
public static final java.lang.String CRAWL_PROFILE_PUSH_STUB
private java.util.regex.Pattern crawlerurlmustmatch
private java.util.regex.Pattern crawlerurlmustnotmatch
private java.util.regex.Pattern crawleripmustmatch
private java.util.regex.Pattern crawleripmustnotmatch
private java.util.regex.Pattern crawlernodepthlimitmatch
private java.util.regex.Pattern indexurlmustmatch
private java.util.regex.Pattern indexurlmustnotmatch
private java.util.regex.Pattern indexcontentmustmatch
private java.util.regex.Pattern indexcontentmustnotmatch
private java.util.regex.Pattern snapshotsMustnotmatch
private final java.util.Map<java.lang.String,java.util.concurrent.atomic.AtomicInteger> doms
private final VocabularyScraper scraper
private java.util.Map<java.lang.String,java.util.regex.Pattern> cmap
public CrawlProfile(java.lang.String name, java.lang.String crawlerUrlMustMatch, java.lang.String crawlerUrlMustNotMatch, java.lang.String crawlerIpMustMatch, java.lang.String crawlerIpMustNotMatch, java.lang.String crawlerCountryMustMatch, java.lang.String crawlerNoDepthLimitMatch, java.lang.String indexUrlMustMatch, java.lang.String indexUrlMustNotMatch, java.lang.String indexContentMustMatch, java.lang.String indexContentMustNotMatch, int depth, boolean directDocByURL, java.util.Date recrawlIfOlder, int domMaxPages, boolean crawlingQ, boolean followFrames, boolean obeyHtmlRobotsNoindex, boolean obeyHtmlRobotsNofollow, boolean indexText, boolean indexMedia, boolean storeHTCache, boolean remoteIndexing, int snapshotsMaxDepth, boolean snapshotsLoadImage, boolean snapshotsReplaceOld, java.lang.String snapshotsMustnotmatch, CacheStrategy cacheStrategy, java.lang.String collections, java.lang.String userAgentName, VocabularyScraper scraper, int timezoneOffset)
name
- name of the crawl profilestartURL
- root URL of the crawlcrawlerUrlMustMatch
- URLs which do not match this regex will be ignored in the crawlercrawlerUrlMustNotMatch
- URLs which match this regex will be ignored in the crawlercrawlerIpMustMatch
- IPs from URLs which do not match this regex will be ignored in the crawlercrawlerIpMustNotMatch
- IPs from URLs which match this regex will be ignored in the crawlercrawlerCountryMustMatch
- URLs from a specific country must matchcrawlerNoDepthLimitMatch
- if matches, no depth limit is applied to the crawlerindexUrlMustMatch
- URLs which do not match this regex will be ignored for indexingindexUrlMustNotMatch
- URLs which match this regex will be ignored for indexingindexContentMustMatch
- content which do not match this regex will be ignored for indexingindexContentMustNotMatch
- content which match this regex will be ignored for indexingdepth
- height of the tree which will be created by the crawlerdirectDocByURL
- if true, then linked documents that cannot be parsed are indexed as documentrecrawlIfOlder
- documents which have been indexed in the past will be indexed again if they are older than the given datedomMaxPages
- maximum number from one domain which will be indexedcrawlingQ
- true if URLs containing question marks shall be indexedindexText
- true if text content of URL shall be indexedindexMedia
- true if media content of URL shall be indexedstoreHTCache
- true if content shall be kept in cache after indexingremoteIndexing
- true if part of the crawl job shall be distributedsnapshotsMaxDepth
- if the current crawl depth is equal or below that given depth, a snapshot is generatedsnapshotsLoadImage
- true if graphical (== pdf) snapshots shall be madesnapshotsReplaceOld
- true if snapshots shall not be historizedsnapshotsMustnotmatch
- a regular expression; if it matches on the url, the snapshot is not generatedxsstopw
- true if static stop words shall be ignoredxdstopw
- true if dynamic stop words shall be ignoredxpstopw
- true if parent stop words shall be ignoredcacheStrategy
- determines if and how cache is used loading contentcollections
- a comma-separated list of tags which are attached to index entriesuserAgentName
- the profile name of the user agent to be usedscraper
- a scraper for vocabulariestimezoneOffset
- the time offset in minutes for scraped dates in text without time zonepublic CrawlProfile(java.util.Map<java.lang.String,java.lang.String> ext)
ext
- contains valuespublic VocabularyScraper scraper()
public void domInc(java.lang.String domain)
private java.lang.String domName(boolean attr, int index)
public ClientIdentification.Agent getAgent()
public java.util.concurrent.atomic.AtomicInteger getCount(java.lang.String domain)
public final void put(java.lang.String key, boolean value)
key
- name of the parametervalue
- value of the parameterprivate final void put(java.lang.String key, int value)
key
- name of the parametervalue
- value of the parameterprivate final void put(java.lang.String key, long value)
key
- name of the parametervalue
- value of the parameterpublic java.lang.String handle()
public java.util.Map<java.lang.String,java.util.regex.Pattern> collections()
public static java.util.Map<java.lang.String,java.util.regex.Pattern> collectionParser(java.lang.String collectionString)
public java.lang.String name()
public java.lang.String collectionName()
public java.util.regex.Pattern urlMustMatchPattern()
public java.util.regex.Pattern urlMustNotMatchPattern()
public java.util.regex.Pattern ipMustMatchPattern()
public java.util.regex.Pattern ipMustNotMatchPattern()
public java.lang.String[] countryMustMatchList()
public java.util.regex.Pattern crawlerNoDepthLimitMatchPattern()
public java.util.regex.Pattern indexUrlMustMatchPattern()
public java.util.regex.Pattern indexUrlMustNotMatchPattern()
public java.util.regex.Pattern indexContentMustMatchPattern()
public java.util.regex.Pattern indexContentMustNotMatchPattern()
public int depth()
public boolean directDocByURL()
public CacheStrategy cacheStrategy()
public void setCacheStrategy(CacheStrategy newStrategy)
public long recrawlIfOlder()
public int domMaxPages()
public boolean crawlingQ()
public boolean followFrames()
public boolean obeyHtmlRobotsNoindex()
public boolean obeyHtmlRobotsNofollow()
public boolean indexText()
public boolean indexMedia()
public boolean storeHTCache()
public boolean remoteIndexing()
public int snapshotMaxdepth()
public boolean snapshotLoadImage()
public boolean snapshotReplaceold()
public java.util.regex.Pattern snapshotsMustnotmatch()
public int timezoneOffset()
public static java.util.Date getRecrawlDate(long oldTimeMinutes)
oldTimeMinutes
- public static java.lang.String siteFilter(java.util.Collection<? extends MultiProtocolURL> urls)
public static java.lang.String mustMatchFilterFullDomain(MultiProtocolURL url)
public static java.lang.String subpathFilter(java.util.Collection<? extends MultiProtocolURL> urls)
public static java.lang.String mustMatchSubpath(MultiProtocolURL url)
public boolean isPushCrawlProfile()
public void putProfileEntry(java.lang.String CRAWL_PROFILE_PREFIX, serverObjects prop, boolean active, boolean dark, int count, int domlistlength)