public class Latency
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
static class |
Latency.Host |
Modifier and Type | Field and Description |
---|---|
private static java.util.concurrent.ConcurrentHashMap<java.lang.String,Latency.Host> |
map |
private static int |
mapMaxSize |
Constructor and Description |
---|
Latency() |
Modifier and Type | Method and Description |
---|---|
static long |
getDomainSleepTime(RobotsTxt robots,
CrawlProfile profileEntry,
DigestURL crawlURL)
Get the minimum sleep time for a given url.
|
static long |
getRobotsTime(RobotsTxt robots,
DigestURL crawlURL,
ClientIdentification.Agent agent)
load a robots.txt to get the robots time.
|
private static Latency.Host |
host(DigestURL url) |
static java.util.Iterator<java.util.Map.Entry<java.lang.String,Latency.Host>> |
iterator() |
static void |
updateAfterLoad(DigestURL url,
long time)
update the latency entry after a host was accessed to load a file
|
static void |
updateAfterSelection(DigestURL url,
long robotsCrawlDelay)
update the latency entry after a host was selected for queueing into the loader
|
static void |
updateBeforeLoad(DigestURL url)
update the latency entry before a host is accessed
|
static int |
waitingRemaining(DigestURL url,
RobotsTxt robots,
ClientIdentification.Agent agent)
calculates how long should be waited until the domain can be accessed again
this follows from:
- given minimum access times
- the fact that a URL is a CGI URL or not
- the times that the domain was accessed (flux factor)
- the response latency of the domain
- and a given minimum access time as given in robots.txt
|
static java.lang.String |
waitingRemainingExplain(DigestURL url,
RobotsTxt robots,
ClientIdentification.Agent agent) |
static int |
waitingRemainingGuessed(java.lang.String hostname,
int port,
java.lang.String hosthash,
RobotsTxt robots,
ClientIdentification.Agent agent)
guess a minimum waiting time
the returned time is not exact: if the domain has not yet been checked against the robots.txt delay value, the guess is too low
|
static int |
waitingRobots(MultiProtocolURL url,
RobotsTxt robots,
ClientIdentification.Agent agent)
Return the waiting time demanded by the robots.txt file of the target host.
|
private static int |
waitingRobots(java.lang.String hostport,
RobotsTxt robots,
ClientIdentification.Agent agent,
boolean fetchOnlineIfNotAvailableOrNotFresh) |
private static final int mapMaxSize
private static final java.util.concurrent.ConcurrentHashMap<java.lang.String,Latency.Host> map
public static void updateAfterSelection(DigestURL url, long robotsCrawlDelay)
url
- robotsCrawlDelay
- the crawl-delay given by the robots.txt; 0 if it does not exist
public static void updateBeforeLoad(DigestURL url)
url
public static void updateAfterLoad(DigestURL url, long time)
url
- time
- the time to load the file in milliseconds
private static Latency.Host host(DigestURL url)
public static java.util.Iterator<java.util.Map.Entry<java.lang.String,Latency.Host>> iterator()
public static int waitingRobots(MultiProtocolURL url, RobotsTxt robots, ClientIdentification.Agent agent)
url
- robots
- agent
private static int waitingRobots(java.lang.String hostport, RobotsTxt robots, ClientIdentification.Agent agent, boolean fetchOnlineIfNotAvailableOrNotFresh)
public static int waitingRemainingGuessed(java.lang.String hostname, int port, java.lang.String hosthash, RobotsTxt robots, ClientIdentification.Agent agent)
hostname
- port
- hosthash
- robots
- agent
public static int waitingRemaining(DigestURL url, RobotsTxt robots, ClientIdentification.Agent agent)
url
- robots
- agent
public static java.lang.String waitingRemainingExplain(DigestURL url, RobotsTxt robots, ClientIdentification.Agent agent)
public static long getDomainSleepTime(RobotsTxt robots, CrawlProfile profileEntry, DigestURL crawlURL)
robots
- profileEntry
- crawlURL
-
public static long getRobotsTime(RobotsTxt robots, DigestURL crawlURL, ClientIdentification.Agent agent)
robots
- crawlURL
- agent
-