public final class LoaderDispatcher
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
private class |
LoaderDispatcher.Loader |
Modifier and Type | Field and Description |
---|---|
private static java.util.concurrent.ConcurrentHashMap<java.lang.String,java.lang.Long> |
accessTime |
private static int |
accessTimeMaxsize |
private FileLoader |
fileLoader |
private FTPLoader |
ftpLoader |
private HTTPLoader |
httpLoader |
private java.util.concurrent.ConcurrentHashMap<DigestURL,java.util.concurrent.Semaphore> |
loaderSteering |
private static ConcurrentLog |
log |
private Switchboard |
sb |
private SMBLoader |
smbLoader |
private java.util.HashSet<java.lang.String> |
supportedProtocols |
Constructor and Description |
---|
LoaderDispatcher(Switchboard sb) |
Modifier and Type | Method and Description |
---|---|
private void |
checkAccessTime(ClientIdentification.Agent agent,
DigestURL url)
Check access time: this is a double-check (we checked possibly already in the balancer)
to make sure that we don't DoS the target by mistake
|
static void |
cleanupAccessTimeTable(long timeout) |
java.util.HashSet<java.lang.String> |
getSupportedProtocols() |
boolean |
isSupportedProtocol(java.lang.String protocol) |
void |
load(DigestURL url,
CacheStrategy cacheStratgy,
int maxFileSize,
java.io.File targetFile,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent) |
Response |
load(Request request,
CacheStrategy cacheStrategy,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent) |
Response |
load(Request request,
CacheStrategy cacheStrategy,
int maxFileSize,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent)
loads a resource from cache or web/ftp/smb/file;
on concurrent execution waits max 5 sec for the previous load to finish.
|
byte[] |
loadContent(Request request,
CacheStrategy cacheStrategy,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent)
load the url as byte[] content from the web or the cache
|
Document |
loadDocument(DigestURL location,
CacheStrategy cachePolicy,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent) |
Document[] |
loadDocuments(Request request,
CacheStrategy cacheStrategy,
int maxFileSize,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent) |
private Response |
loadFromCache(Request request,
CacheStrategy cacheStrategy,
ClientIdentification.Agent agent,
DigestURL url,
CrawlProfile crawlProfile)
Try loading requested resource from cache according to cache strategy
|
void |
loadIfNotExistBackground(DigestURL url,
java.io.File cache,
int maxFileSize,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent) |
void |
loadIfNotExistBackground(DigestURL url,
int maxFileSize,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent) |
private Response |
loadInternal(Request request,
CacheStrategy cacheStrategy,
int maxFileSize,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent)
load a resource from the web, from ftp, from smb or a file
|
java.util.Map<AnchorURL,java.lang.String> |
loadLinks(DigestURL url,
CacheStrategy cacheStrategy,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent,
int timezoneOffset)
load all links from a resource
|
java.io.InputStream |
openInputStream(Request request,
CacheStrategy cacheStrategy,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent)
Open url as InputStream from the web or the cache
|
private java.io.InputStream |
openInputStreamInternal(Request request,
CacheStrategy cacheStrategy,
int maxFileSize,
Blacklist.BlacklistType blacklistType,
ClientIdentification.Agent agent)
Open an InputStream on a resource from the web, from ftp, from smb or a file
|
private int |
protocolMaxFileSize(DigestURL url) |
Request |
request(DigestURL url,
boolean forText,
boolean global)
generate a request object
|
private static final int accessTimeMaxsize
private static final ConcurrentLog log
private static final java.util.concurrent.ConcurrentHashMap<java.lang.String,java.lang.Long> accessTime
private final Switchboard sb
private final java.util.HashSet<java.lang.String> supportedProtocols
private final HTTPLoader httpLoader
private final FTPLoader ftpLoader
private final SMBLoader smbLoader
private final FileLoader fileLoader
private final java.util.concurrent.ConcurrentHashMap<DigestURL,java.util.concurrent.Semaphore> loaderSteering
public LoaderDispatcher(Switchboard sb)
public boolean isSupportedProtocol(java.lang.String protocol)
public java.util.HashSet<java.lang.String> getSupportedProtocols()
public Request request(DigestURL url, boolean forText, boolean global)
url
- the target url
forText
- shows that this was a for-text crawling request
global
- shows that this was a global crawling request
public void load(DigestURL url, CacheStrategy cacheStratgy, int maxFileSize, java.io.File targetFile, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
java.io.IOException
public Response load(Request request, CacheStrategy cacheStrategy, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
java.io.IOException
public Response load(Request request, CacheStrategy cacheStrategy, int maxFileSize, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
request
- the request essentials
cacheStrategy
- strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
maxFileSize
-
blacklistType
-
agent
-
java.io.IOException
private Response loadInternal(Request request, CacheStrategy cacheStrategy, int maxFileSize, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
request
- the request essentials
cacheStrategy
- strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
java.io.IOException
private Response loadFromCache(Request request, CacheStrategy cacheStrategy, ClientIdentification.Agent agent, DigestURL url, CrawlProfile crawlProfile) throws java.io.IOException
request
- request to resource
cacheStrategy
- cache strategy to use
agent
- agent identifier
url
- resource url
crawlProfile
- crawl profile
java.io.IOException
- when an error occurred
private java.io.InputStream openInputStreamInternal(Request request, CacheStrategy cacheStrategy, int maxFileSize, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
request
- the request essentials
cacheStrategy
- strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY
java.io.IOException
- when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
private void checkAccessTime(ClientIdentification.Agent agent, DigestURL url)
agent
- agent identifier
url
- target url
private int protocolMaxFileSize(DigestURL url)
public byte[] loadContent(Request request, CacheStrategy cacheStrategy, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
request
- cacheStrategy
- timeout
- byte[]
java.io.IOException
public java.io.InputStream openInputStream(Request request, CacheStrategy cacheStrategy, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
request
- must be not null
cacheStrategy
- cache strategy to use
blacklistType
- black list
agent
- agent identification for HTTP requests
java.io.IOException
- when url is malformed or blacklisted
public Document[] loadDocuments(Request request, CacheStrategy cacheStrategy, int maxFileSize, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException, Parser.Failure
java.io.IOException
Parser.Failure
public Document loadDocument(DigestURL location, CacheStrategy cachePolicy, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent) throws java.io.IOException
java.io.IOException
public final java.util.Map<AnchorURL,java.lang.String> loadLinks(DigestURL url, CacheStrategy cacheStrategy, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent, int timezoneOffset) throws java.io.IOException
url
- the url that shall be loaded
cacheStrategy
- the cache strategy
java.io.IOException
public static void cleanupAccessTimeTable(long timeout)
public void loadIfNotExistBackground(DigestURL url, java.io.File cache, int maxFileSize, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent)
public void loadIfNotExistBackground(DigestURL url, int maxFileSize, Blacklist.BlacklistType blacklistType, ClientIdentification.Agent agent)