public class RobotsTxt
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
static class |
RobotsTxt.CheckEntry |
private static class |
RobotsTxt.DomSync |
Modifier and Type | Field and Description |
---|---|
private LoaderDispatcher |
loader |
private static ConcurrentLog |
log |
protected static java.lang.String |
ROBOTS_DB_PATH_SEPARATOR |
protected static java.util.regex.Pattern |
ROBOTS_DB_PATH_SEPARATOR_MATCHER |
protected static java.lang.String |
ROBOTS_TXT_PATH |
private java.util.concurrent.ConcurrentMap<java.lang.String,RobotsTxt.DomSync> |
syncObjects |
private WorkTables |
tables |
Constructor and Description |
---|
RobotsTxt(WorkTables worktables,
LoaderDispatcher loader) |
Modifier and Type | Method and Description |
---|---|
private java.lang.String |
addEntry(RobotsTxtEntry entry) |
void |
clear() |
void |
delete(MultiProtocolURL theURL) |
void |
ensureExist(MultiProtocolURL theURL,
ClientIdentification.Agent agent,
boolean concurrent) |
RobotsTxtEntry |
getEntry(MultiProtocolURL theURL,
ClientIdentification.Agent agent) |
RobotsTxtEntry |
getEntry(java.lang.String urlHostPort,
ClientIdentification.Agent agent,
boolean fetchOnlineIfNotAvailableOrNotFresh) |
static java.lang.String |
getHostPort(MultiProtocolURL theURL) |
static boolean |
isRobotsURL(MultiProtocolURL url) |
java.util.Collection<RobotsTxt.CheckEntry> |
massCrawlCheck(java.util.Collection<DigestURL> rootURLs,
ClientIdentification.Agent userAgent,
int concurrency) |
private void |
processNewEntry(DigestURL robotsURL,
Response response,
java.lang.String[] thisAgents) |
private void |
processOldEntry(RobotsTxtEntry robotsTxt4Host,
DigestURL robotsURL,
BEncodedHeap robotsTable) |
static DigestURL |
robotsURL(java.lang.String urlHostPort)
Generate a robots.txt URL.
int |
size() |
private static final ConcurrentLog log
protected static final java.lang.String ROBOTS_TXT_PATH
protected static final java.lang.String ROBOTS_DB_PATH_SEPARATOR
protected static final java.util.regex.Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER
private final java.util.concurrent.ConcurrentMap<java.lang.String,RobotsTxt.DomSync> syncObjects
private final WorkTables tables
private final LoaderDispatcher loader
public RobotsTxt(WorkTables worktables, LoaderDispatcher loader)
public void clear() throws java.io.IOException
java.io.IOException
public int size() throws java.io.IOException
java.io.IOException
public RobotsTxtEntry getEntry(MultiProtocolURL theURL, ClientIdentification.Agent agent)
public RobotsTxtEntry getEntry(java.lang.String urlHostPort, ClientIdentification.Agent agent, boolean fetchOnlineIfNotAvailableOrNotFresh)
public void delete(MultiProtocolURL theURL)
public void ensureExist(MultiProtocolURL theURL, ClientIdentification.Agent agent, boolean concurrent)
private void processOldEntry(RobotsTxtEntry robotsTxt4Host, DigestURL robotsURL, BEncodedHeap robotsTable)
private void processNewEntry(DigestURL robotsURL, Response response, java.lang.String[] thisAgents)
private java.lang.String addEntry(RobotsTxtEntry entry)
public static final java.lang.String getHostPort(MultiProtocolURL theURL)
public static boolean isRobotsURL(MultiProtocolURL url)
public static DigestURL robotsURL(java.lang.String urlHostPort)
urlHostPort
- a string of the form host:port (presumably; the original description was truncated in extraction — verify against the source Javadoc)
public java.util.Collection<RobotsTxt.CheckEntry> massCrawlCheck(java.util.Collection<DigestURL> rootURLs, ClientIdentification.Agent userAgent, int concurrency)