public class HostBalancer extends java.lang.Object implements Balancer
Modifier and Type | Field and Description |
---|---|
static HandleMap |
depthCache |
private boolean |
exceed134217727 |
private java.io.File |
hostsPath |
private static ConcurrentLog |
log |
private int |
onDemandLimit |
private java.util.Map<java.lang.String,HostQueue> |
queues |
private java.util.Set<java.lang.String> |
roundRobinHostHashes |
Constructor and Description |
---|
HostBalancer(java.io.File hostsPath,
int onDemandLimit,
boolean exceed134217727) |
Modifier and Type | Method and Description |
---|---|
void |
clear()
delete all urls from the stack
|
void |
close()
close the balancer object
|
Request |
get(byte[] urlhash)
get one url from the crawl stack
|
java.util.Map<java.lang.String,java.lang.Integer[]> |
getDomainStackHosts(RobotsTxt robots)
get a list of domains that are currently maintained as domain stacks
|
java.util.List<Request> |
getDomainStackReferences(java.lang.String host,
int maxcount,
long maxtime)
get lists of crawl request entries for a specific host
|
boolean |
getExceed134217727() |
int |
getOnDemandLimit() |
boolean |
has(byte[] urlhashb)
check if given url hash is contained in the balancer stack
|
private void |
init()
fills the queue by scanning the hostsPath directory in a separate thread, so that
the call returns immediately (large unfinished crawls may take longer to load)
|
boolean |
isEmpty()
check if stack is empty
|
java.util.Iterator<Request> |
iterator()
iterate through all requests in the queue
|
Request |
pop(boolean delay,
CrawlSwitchboard cs,
RobotsTxt robots)
get the next entry in this crawl queue in such a way that the domain access time delta is maximized
and always above the given minimum delay time.
|
java.lang.String |
push(Request entry,
CrawlProfile profile,
RobotsTxt robots)
push a request to one of the host queues.
|
int |
remove(HandleSet urlHashes) |
int |
removeAllByHostHashes(java.util.Set<java.lang.String> hosthashes)
delete all urls which are stored for given host hashes
|
int |
removeAllByProfileHandle(java.lang.String profileHandle,
long timeout)
delete all urls from the stack by given profile handle
|
int |
size()
get the size of the stack
|
private static final ConcurrentLog log
public static final HandleMap depthCache
private final java.io.File hostsPath
private final boolean exceed134217727
private final java.util.Map<java.lang.String,HostQueue> queues
private final java.util.Set<java.lang.String> roundRobinHostHashes
private final int onDemandLimit
public HostBalancer(java.io.File hostsPath, int onDemandLimit, boolean exceed134217727)
private void init()
public void close()
Balancer
public void clear()
Balancer
public Request get(byte[] urlhash) throws java.io.IOException
Balancer
public int removeAllByProfileHandle(java.lang.String profileHandle, long timeout) throws java.io.IOException, SpaceExceededException
Balancer
removeAllByProfileHandle
in interface Balancer
java.io.IOException
SpaceExceededException
public int removeAllByHostHashes(java.util.Set<java.lang.String> hosthashes)
removeAllByHostHashes
in interface Balancer
hosthashes -
public int remove(HandleSet urlHashes) throws java.io.IOException
public boolean has(byte[] urlhashb)
Balancer
public int size()
Balancer
public boolean isEmpty()
Balancer
public int getOnDemandLimit()
getOnDemandLimit
in interface Balancer
public boolean getExceed134217727()
getExceed134217727
in interface Balancer
public java.lang.String push(Request entry, CrawlProfile profile, RobotsTxt robots) throws java.io.IOException, SpaceExceededException
push
in interface Balancer
entry -
profile -
robots -
java.io.IOException
SpaceExceededException
public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws java.io.IOException
pop
in interface Balancer
delay - true if the requester demands forced delays using explicit thread sleep
profile -
java.io.IOException
SpaceExceededException
public java.util.Iterator<Request> iterator() throws java.io.IOException
Balancer
public java.util.Map<java.lang.String,java.lang.Integer[]> getDomainStackHosts(RobotsTxt robots)
getDomainStackHosts
in interface Balancer
public java.util.List<Request> getDomainStackReferences(java.lang.String host, int maxcount, long maxtime)
getDomainStackReferences
in interface Balancer
host -
maxcount -
maxtime -