public final class CrawlStacker
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
private boolean |
acceptGlobalURLs |
private boolean |
acceptLocalURLs |
private CrawlSwitchboard |
crawler |
private FilterEngine |
domainList |
static java.lang.String |
ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER |
static java.lang.String |
ERROR_NO_MATCH_MUST_MATCH_FILTER |
private Segment |
indexSegment |
private static ConcurrentLog |
log |
CrawlQueues |
nextQueue |
private SeedDB |
peers |
private WorkflowProcessor<Request> |
requestQueue |
private RobotsTxt |
robots |
Constructor and Description |
---|
CrawlStacker(RobotsTxt robots,
CrawlQueues cq,
CrawlSwitchboard cs,
Segment indexSegment,
SeedDB peers,
boolean acceptLocalURLs,
boolean acceptGlobalURLs,
FilterEngine domainList) |
Modifier and Type | Method and Description |
---|---|
boolean |
acceptGlobalURLs() |
boolean |
acceptLocalURLs() |
void |
announceClose() |
java.lang.String |
checkAcceptanceChangeable(DigestURL url,
CrawlProfile profile,
int depth)
Tests whether a URL shall be accepted, using attributes that are defined at crawl start but can be changed during a crawl.
|
java.lang.String |
checkAcceptanceInitially(DigestURL url,
CrawlProfile profile)
Tests whether a URL shall be accepted for crawling, using attributes that stay constant for the whole crawl.
These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
|
void |
clear() |
void |
close() |
private void |
enqueueEntries(byte[] initiator,
java.lang.String profileHandle,
java.util.List<AnchorURL> hyperlinks,
boolean replace,
int timezoneOffset) |
void |
enqueueEntriesAsynchronous(byte[] initiator,
java.lang.String profileHandle,
java.util.List<AnchorURL> hyperlinks,
int timezoneOffset) |
void |
enqueueEntriesFTP(byte[] initiator,
java.lang.String profileHandle,
java.lang.String host,
int port,
java.lang.String user,
java.lang.String pw,
boolean replace,
int timezoneOffset) |
void |
enqueueEntry(Request entry) |
boolean |
isEmpty() |
Request |
job(Request entry) |
int |
size() |
java.lang.String |
stackCrawl(Request entry)
Stacks a crawl item.
|
java.lang.String |
stackSimpleCrawl(DigestURL url)
Simple method to add one URL as a crawl job.
|
java.lang.String |
urlInAcceptedDomain(DigestURL url)
Tests whether a URL can be used for crawling/indexing.
This mainly checks whether the URL is in the declared domain (local/global).
|
java.lang.String |
urlInAcceptedDomainHash(byte[] urlhash) |
public static java.lang.String ERROR_NO_MATCH_MUST_MATCH_FILTER
public static java.lang.String ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER
private static final ConcurrentLog log
private final RobotsTxt robots
private final WorkflowProcessor<Request> requestQueue
public final CrawlQueues nextQueue
private final CrawlSwitchboard crawler
private final Segment indexSegment
private final SeedDB peers
private final boolean acceptLocalURLs
private final boolean acceptGlobalURLs
private final FilterEngine domainList
public CrawlStacker(RobotsTxt robots, CrawlQueues cq, CrawlSwitchboard cs, Segment indexSegment, SeedDB peers, boolean acceptLocalURLs, boolean acceptGlobalURLs, FilterEngine domainList)
public int size()
public boolean isEmpty()
public void clear()
public void announceClose()
public void close()
public void enqueueEntry(Request entry)
public void enqueueEntriesAsynchronous(byte[] initiator, java.lang.String profileHandle, java.util.List<AnchorURL> hyperlinks, int timezoneOffset)
private void enqueueEntries(byte[] initiator, java.lang.String profileHandle, java.util.List<AnchorURL> hyperlinks, boolean replace, int timezoneOffset)
public void enqueueEntriesFTP(byte[] initiator, java.lang.String profileHandle, java.lang.String host, int port, java.lang.String user, java.lang.String pw, boolean replace, int timezoneOffset)
public java.lang.String stackSimpleCrawl(DigestURL url)
Parameters: url
public java.lang.String stackCrawl(Request entry)
Parameters: entry
public java.lang.String checkAcceptanceInitially(DigestURL url, CrawlProfile profile)
Parameters: url, profile
public java.lang.String checkAcceptanceChangeable(DigestURL url, CrawlProfile profile, int depth)
Parameters: url, profile, depth
public java.lang.String urlInAcceptedDomain(DigestURL url)
Parameters: url
public java.lang.String urlInAcceptedDomainHash(byte[] urlhash)
public boolean acceptLocalURLs()
public boolean acceptGlobalURLs()