public final class Switchboard extends serverSwitch
Modifier and Type | Class and Description
---|---
class | Switchboard.receiptSending
Fields inherited from class serverSwitch: appPath, dataPath, firstInit, log, serverJobs
Constructor and Description
---
Switchboard(java.io.File dataPath, java.io.File appPath, java.lang.String initPath, java.lang.String configPath)
Modifier and Type | Method and Description |
---|---|
void |
addAllToIndex(DigestURL url,
java.util.Map<AnchorURL,java.lang.String> links,
SearchEvent searchEvent,
java.lang.String heuristicName,
java.util.Map<java.lang.String,java.util.regex.Pattern> collections,
boolean doublecheck) |
void |
addToCrawler(java.util.Collection<DigestURL> urls,
boolean asglobal)
add urls to the Crawler - which itself loads each URL, parses the content and adds it to the index;
a transparent alternative to "addToIndex" that includes the double-in-crawler check and display in the crawl monitor,
but does not return results to an ongoing search
|
void |
addToIndex(java.util.Collection<DigestURL> urls,
SearchEvent searchEvent,
java.lang.String heuristicName,
java.util.Map<java.lang.String,java.util.regex.Pattern> collections,
boolean doublecheck)
load the content of a URL, parse the content and add it to the index. This process is started
concurrently.
|
int |
adminAuthenticated(RequestHeader requestHeader)
check the authentication status for a request; access shall be granted if the return value is >= 2. These are the
cases where access to protected pages is granted: a password is not configured: auth-level 2;
access from localhost is granted and the access comes from localhost: auth-level 3; a password is
configured, access comes from localhost and the realm value of an http-authentify string is equal to
the stored base64MD5: auth-level 3; a password is configured and access comes with a matching
http-authentify: auth-level 4
|
float |
averageQPM() |
float |
averageQPMGlobal() |
float |
averageQPMPrivateLocal() |
float |
averageQPMPublicLocal() |
void |
checkInterruption() |
boolean |
cleanProfiles()
Crawl Profiles are saved independently from the queues themselves and therefore
have to be cleaned up from time to time. |
boolean |
cleanupJob() |
int |
cleanupJobSize() |
static void |
clearCaches() |
void |
close() |
IndexingQueueEntry |
condenseDocument(IndexingQueueEntry in) |
void |
continueCrawlJob(java.lang.String jobType)
Continue the previously paused crawling
|
boolean |
crawlJobIsPaused(java.lang.String jobType) |
static int |
currentPPM() |
java.lang.String |
dhtShallTransfer() |
boolean |
dhtTransferJob() |
int |
getIndexingProcessorsQueueSize() |
RankingProfile |
getRanking() |
static Switchboard |
getSwitchboard() |
(package private) java.lang.String |
getSysinfo() |
DigestURL |
getURL(byte[] urlhash) |
void |
heuristicRSS(java.lang.String urlpattern,
SearchEvent searchEvent,
java.lang.String feedName)
Deprecated.
use FederateSearchManager(SearchEvent) instead
|
void |
heuristicSearchResults(URIMetadataNode resulturl)
Get the outbound links of the result and add each unique link to the crawler queue.
If the input resulturl is a full index document with outbound links, these are used;
otherwise the url is loaded and the links are extracted/parsed
|
void |
heuristicSite(SearchEvent searchEvent,
java.lang.String host) |
void |
initAutocrawl(boolean activate)
Initialise the Autocrawl thread
|
void |
initBlog() |
void |
initBookmarks() |
void |
initBookmarks(boolean b) |
void |
initMessages() |
void |
initRemoteCrawler(boolean activate)
Initialize and apply all settings to enable remote crawls
(if remote crawl is not in use, the resources are saved). If called with
activate==false, worker threads are closed and removed (to free resources)
|
void |
initRemoteProxy() |
void |
initWiki() |
boolean |
isAllIPMode() |
boolean |
isGlobalMode() |
boolean |
isInMyCluster(Seed seed) |
boolean |
isInMyCluster(java.lang.String peer) |
boolean |
isIntranetMode() |
boolean |
isIPNoCheckMode()
in nocheck mode the isLocal property is not checked, so that the DNS lookup can be omitted.
|
boolean |
isP2PMode() |
boolean |
isPublicRobinson() |
boolean |
isRobinsonMode() |
boolean |
isTerminated() |
private static void |
loadSeedListConcurrently(SeedDB peers,
java.lang.String seedListFileURL,
java.util.concurrent.atomic.AtomicInteger scc,
int timeout,
boolean checkAge) |
void |
loadSeedLists() |
java.lang.String |
onlineCaution()
checks if the proxy, the local search or the remote search was accessed some time before. If no limit is
exceeded, null is returned.
|
void |
overwriteNetworkDefinition(java.lang.String sysinfo) |
IndexingQueueEntry |
parseDocument(IndexingQueueEntry in) |
private Document[] |
parseDocument(Response response) |
void |
pauseCrawlJob(java.lang.String jobType,
java.lang.String cause)
With this function the crawling process can be paused
|
private static java.lang.String |
ppRamString(long bytes)
Creates a human readable string from a number which represents the size of a file.
|
void |
processSurrogate(java.io.InputStream is,
java.lang.String name) |
boolean |
processSurrogate(java.lang.String s) |
void |
reload(java.util.Collection<java.lang.String> reloadURLStrings,
java.util.Map<java.lang.String,java.util.regex.Pattern> collections,
boolean doublecheck) |
void |
remove(byte[] urlhash) |
void |
remove(java.util.Collection<java.lang.String> deleteIDs) |
boolean |
schedulerJob() |
int |
schedulerJobSize() |
void |
setHttpServer(YaCyHttpServer server)
set/remember jetty server
|
void |
setRemotecrawlPPM(int ppm) |
boolean |
shallTerminate() |
java.lang.String |
stackUrl(CrawlProfile profile,
DigestURL url)
stack the url to the crawler
|
void |
stackURLs(java.util.Set<DigestURL> rootURLs,
CrawlProfile profile,
java.util.Set<DigestURL> successurls,
java.util.Map<DigestURL,java.lang.String> failurls) |
void |
storeDocumentIndex(IndexingQueueEntry in) |
private void |
storeDocumentIndex(Response queueEntry,
java.util.Map<java.lang.String,java.util.regex.Pattern> collections,
Document document,
Condenser condenser,
SearchEvent searchEvent,
java.lang.String sourceName) |
void |
surrogateFreeMem() |
boolean |
surrogateProcess() |
int |
surrogateQueueSize() |
void |
switchNetwork(java.lang.String networkDefinition) |
void |
terminate(long delay,
java.lang.String reason) |
void |
terminate(java.lang.String reason) |
java.lang.String |
toIndexer(Response response)
pass a response to the indexer
|
void |
updateMySeed() |
HarvestProcess |
urlExists(java.lang.String hash)
tests if hash occurs in any database.
|
void |
urlRemove(Segment segment,
byte[] hash) |
boolean |
verifyAuthentication(RequestHeader header) |
boolean |
waitForShutdown() |
IndexingQueueEntry |
webStructureAnalysis(IndexingQueueEntry in) |
Methods inherited from class serverSwitch: configKeys, deployThread, deployThread, genRandomPassword, genRandomPassword, getAppPath, getAppPath, getConfig, getConfigArray, getConfigBool, getConfigFileFromWebOrLocally, getConfigFloat, getConfigInt, getConfigLong, getConfigSet, getDataPath, getDataPath, getHttpServer, getLocalPort, getLog, getPublicPort, getRemoved, getThread, handleBusyState, intermissionAllThreads, isConnectedViaUpnp, myPublicIP, myPublicIPs, removeConfig, removeUpnpPort, setConfig, setConfig, setConfig, setConfig, setConfig, setConfig, setConfig, setConfig, setConnectedViaUpnp, setLog, setThreadPerformance, setUpnpPorts, terminateAllThreads, terminateThread, threadNames, toString
static final java.lang.String SOLR_COLLECTION_CONFIGURATION_NAME_OLD
public static final java.lang.String SOLR_COLLECTION_CONFIGURATION_NAME
public static final java.lang.String SOLR_WEBGRAPH_CONFIGURATION_NAME
public static int xstackCrawlSlots
public static long lastPPMUpdate
private static final int dhtMaxContainerCount
private int dhtMaxReferenceCount
public static java.util.SortedSet<java.lang.String> badwords
public static java.util.SortedSet<java.lang.String> stopwords
public static java.util.SortedSet<java.lang.String> blueList
public static java.util.SortedSet<byte[]> stopwordHashes
public static Blacklist urlBlacklist
public static WikiParser wikiParser
public java.io.File htCachePath
public final java.io.File dictionariesPath
public final java.io.File classificationPath
public java.io.File listsPath
public java.io.File htDocsPath
public java.io.File workPath
public java.io.File releasePath
public java.io.File networkRoot
public java.io.File queuesRoot
public java.io.File surrogatesInPath
public java.io.File surrogatesOutPath
public Segment index
public LoaderDispatcher loader
public CrawlSwitchboard crawler
public CrawlQueues crawlQueues
public CrawlStacker crawlStacker
public MessageBoard messageDB
public WikiBoard wikiDB
public BlogBoard blogDB
public BlogBoardComments blogCommentDB
public RobotsTxt robots
public java.util.Map<java.lang.String,java.lang.Object[]> outgoingCookies
public java.util.Map<java.lang.String,java.lang.Object[]> incomingCookies
public volatile long proxyLastAccess
public volatile long localSearchLastAccess
public volatile long remoteSearchLastAccess
public volatile long adminAuthenticationLastAccess
public volatile long optimizeLastRun
public Network yc
public ResourceObserver observer
public UserDB userDB
public BookmarksDB bookmarksDB
public WebStructureGraph webStructure
public java.util.concurrent.ConcurrentHashMap<java.lang.String,java.util.TreeSet<java.lang.Long>> localSearchTracker
public java.util.concurrent.ConcurrentHashMap<java.lang.String,java.util.TreeSet<java.lang.Long>> remoteSearchTracker
public long indexedPages
public int searchQueriesRobinsonFromLocal
public int searchQueriesRobinsonFromRemote
public float searchQueriesGlobal
public java.util.SortedSet<byte[]> clusterhashes
public java.util.List<java.util.regex.Pattern> networkWhitelist
public java.util.List<java.util.regex.Pattern> networkBlacklist
public FilterEngine domainList
private Dispatcher dhtDispatcher
public java.util.concurrent.LinkedBlockingQueue<java.lang.String> trail
public SeedDB peers
public WorkTables tables
public Tray tray
private long lastStats
public WorkflowProcessor<IndexingQueueEntry> indexingDocumentProcessor
public WorkflowProcessor<IndexingQueueEntry> indexingCondensementProcessor
public WorkflowProcessor<IndexingQueueEntry> indexingAnalysisProcessor
public WorkflowProcessor<IndexingQueueEntry> indexingStorageProcessor
public RobotsTxtConfig robotstxtConfig
public boolean useTailCache
public boolean exceed134217727
public final long startupTime
private final java.util.concurrent.Semaphore shutdownSync
private boolean terminate
private boolean startupAction
private static Switchboard sb
public java.util.HashMap<java.lang.String,java.lang.Object[]> crawlJobsStatus
private static long indeSizeCache
private static long indexSizeTime
public Switchboard(java.io.File dataPath, java.io.File appPath, java.lang.String initPath, java.lang.String configPath)
final java.lang.String getSysinfo()
public void setHttpServer(YaCyHttpServer server)
Overrides: setHttpServer in class serverSwitch
public int getIndexingProcessorsQueueSize()
public void overwriteNetworkDefinition(java.lang.String sysinfo) throws java.io.FileNotFoundException, java.io.IOException
Throws: java.io.FileNotFoundException, java.io.IOException
public void switchNetwork(java.lang.String networkDefinition) throws java.io.FileNotFoundException, java.io.IOException
Throws: java.io.FileNotFoundException, java.io.IOException
public void setRemotecrawlPPM(int ppm)
public void initRemoteCrawler(boolean activate)
Parameters: activate - true=enable, false=disable
public void initAutocrawl(boolean activate)
Parameters: activate - true=enable, false=disable
public void initMessages() throws java.io.IOException
Throws: java.io.IOException
public void initWiki() throws java.io.IOException
Throws: java.io.IOException
public void initBlog() throws java.io.IOException
Throws: java.io.IOException
public void initBookmarks() throws java.io.IOException
Throws: java.io.IOException
public static Switchboard getSwitchboard()
public boolean isP2PMode()
public boolean isIntranetMode()
public boolean isGlobalMode()
public boolean isAllIPMode()
public boolean isIPNoCheckMode()
public boolean isRobinsonMode()
public boolean isPublicRobinson()
public boolean isInMyCluster(java.lang.String peer)
public boolean isInMyCluster(Seed seed)
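The mode predicates above are plain getters; a short sketch of how a caller might branch on them (the behavioral comments are assumptions about what the modes usually mean in YaCy, not statements from this page):

```java
Switchboard sb = Switchboard.getSwitchboard();
if (sb.isRobinsonMode()) {
    // Robinson peers keep their index to themselves
} else if (sb.isP2PMode()) {
    // peer participates in the distributed network
}
if (sb.isIntranetMode()) {
    // crawling is expected to stay within the intranet
}
```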
public HarvestProcess urlExists(java.lang.String hash) throws java.io.IOException
Parameters: hash
Throws: java.io.IOException
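A hedged usage sketch: since urlExists returns a HarvestProcess and throws IOException, it can serve as a pre-check before stacking a URL. That a null return means "not found in any database" is an inference from the summary description, not stated explicitly:

```java
try {
    HarvestProcess process = sb.urlExists(urlHash); // urlHash: String url hash
    if (process == null) {
        // hash unknown: the URL can be stacked for crawling
    }
} catch (java.io.IOException e) {
    // database lookup failed
}
```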
public void urlRemove(Segment segment, byte[] hash)
public DigestURL getURL(byte[] urlhash) throws java.io.IOException
Throws: java.io.IOException
public RankingProfile getRanking()
public java.lang.String onlineCaution()
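Per the summary above, onlineCaution returns null when no access-time limit is exceeded. A minimal sketch of using it to defer heavy background work (the scheduling context is hypothetical):

```java
String caution = sb.onlineCaution();
if (caution == null) {
    // proxy and search have been idle long enough: run the expensive task
} else {
    // recent proxy/search activity: postpone the task
}
```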
private static java.lang.String ppRamString(long bytes)
Parameters: bytes - the length of a file
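ppRamString is private and cannot be called from outside; the following is a minimal sketch of an equivalent human-readable size formatter, not the actual implementation:

```java
// Hypothetical equivalent of ppRamString: format a byte count for display.
static String formatSize(long bytes) {
    if (bytes < 1024) return bytes + " Byte";
    double v = bytes;
    String[] units = {"KByte", "MByte", "GByte", "TByte"};
    int i = -1;
    while (v >= 1024 && i < units.length - 1) { v /= 1024.0; i++; }
    return String.format("%.1f %s", v, units[i]);
}
```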
public boolean cleanProfiles() throws java.lang.InterruptedException
Crawl profiles are saved independently from the queues themselves and therefore have to be cleaned up from time to time. This method performs the clean-up if - and only if - the switchboard, loader and local crawl queues are all empty. It then iterates through all existing crawl profiles and removes every profile which is not hard-coded. If this method encounters DB failures, the profile DB will be reset and true will be returned.
Throws: java.lang.InterruptedException - if the current thread has been interrupted, e.g. by the shutdown procedure
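A sketch of invoking cleanProfiles from a maintenance thread; the InterruptedException handling follows the shutdown note above:

```java
try {
    boolean cleaned = sb.cleanProfiles();
    // cleaned is also true after a profile-DB reset caused by DB failures
} catch (InterruptedException e) {
    Thread.currentThread().interrupt(); // shutdown procedure in progress
}
```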
public void close()
public java.lang.String toIndexer(Response response)
Parameters: response
public boolean processSurrogate(java.lang.String s)
public void processSurrogate(java.io.InputStream is, java.lang.String name) throws java.io.IOException
Throws: java.io.IOException
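A hedged sketch of feeding a surrogate dump through the stream variant; the file path and name are illustrative assumptions, not values from this page:

```java
try (java.io.InputStream is = new java.io.FileInputStream("surrogates/dump.xml")) {
    sb.processSurrogate(is, "dump.xml");
} catch (java.io.IOException e) {
    // the surrogate could not be read or processed
}
```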
public int surrogateQueueSize()
public void surrogateFreeMem()
public boolean surrogateProcess()
public static void clearCaches()
public int schedulerJobSize()
public boolean schedulerJob()
public int cleanupJobSize()
public boolean cleanupJob()
public void pauseCrawlJob(java.lang.String jobType, java.lang.String cause)
Parameters: jobType
public void continueCrawlJob(java.lang.String jobType)
Parameters: jobType
public boolean crawlJobIsPaused(java.lang.String jobType)
Parameters: jobType
Returns: true if crawling was paused or false otherwise
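A sketch of the pause/resume cycle using the three methods above. The jobType value is an assumption: YaCy identifies crawl jobs by configuration keys, which are not listed on this page:

```java
String jobType = "50_localcrawl"; // assumed local-crawl job identifier
sb.pauseCrawlJob(jobType, "manual maintenance");
if (sb.crawlJobIsPaused(jobType)) {
    // ... maintenance work while crawling is paused ...
    sb.continueCrawlJob(jobType);
}
```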
public IndexingQueueEntry parseDocument(IndexingQueueEntry in)
private Document[] parseDocument(Response response) throws java.lang.InterruptedException
Throws: java.lang.InterruptedException
public IndexingQueueEntry condenseDocument(IndexingQueueEntry in)
public IndexingQueueEntry webStructureAnalysis(IndexingQueueEntry in)
public void storeDocumentIndex(IndexingQueueEntry in)
private void storeDocumentIndex(Response queueEntry, java.util.Map<java.lang.String,java.util.regex.Pattern> collections, Document document, Condenser condenser, SearchEvent searchEvent, java.lang.String sourceName)
Parameters: queueEntry; collections; document; condenser; searchEvent; sourceName - if this document was created by a crawl, then the sourceName contains the crawl hash
public final void addAllToIndex(DigestURL url, java.util.Map<AnchorURL,java.lang.String> links, SearchEvent searchEvent, java.lang.String heuristicName, java.util.Map<java.lang.String,java.util.regex.Pattern> collections, boolean doublecheck)
public void reload(java.util.Collection<java.lang.String> reloadURLStrings, java.util.Map<java.lang.String,java.util.regex.Pattern> collections, boolean doublecheck)
public void remove(java.util.Collection<java.lang.String> deleteIDs)
public void remove(byte[] urlhash)
public void stackURLs(java.util.Set<DigestURL> rootURLs, CrawlProfile profile, java.util.Set<DigestURL> successurls, java.util.Map<DigestURL,java.lang.String> failurls)
public java.lang.String stackUrl(CrawlProfile profile, DigestURL url)
Parameters: profile; url
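A hedged sketch of stacking a single URL, assuming an existing `sb` and `CrawlProfile profile`; that a null return signals success and a non-null String carries the rejection reason is an inference from the return type, not stated here:

```java
try {
    String failreason = sb.stackUrl(profile, new DigestURL("http://example.org/page"));
    if (failreason != null) {
        System.out.println("URL rejected: " + failreason);
    }
} catch (java.net.MalformedURLException e) {
    // DigestURL construction is assumed to validate the URL string
}
```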
public void addToIndex(java.util.Collection<DigestURL> urls, SearchEvent searchEvent, java.lang.String heuristicName, java.util.Map<java.lang.String,java.util.regex.Pattern> collections, boolean doublecheck)
Parameters: urls - the urls that shall be indexed; searchEvent - (optional) a search event that shall get results from the indexed pages fed in directly. If the object is null it is ignored.
Throws: java.io.IOException, Parser.Failure
public void addToCrawler(java.util.Collection<DigestURL> urls, boolean asglobal)
Parameters: urls - the urls that shall be indexed; asglobal - true adds the urls to the global crawl queue (for remote crawling), false to the local crawler
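A minimal sketch of batching URLs into the local crawler, assuming a Switchboard instance `sb` and that DigestURL validates its input string (the constructor contract is an assumption):

```java
java.util.Collection<DigestURL> urls = new java.util.ArrayList<>();
try {
    urls.add(new DigestURL("http://example.org/"));
    urls.add(new DigestURL("http://example.org/docs/"));
} catch (java.net.MalformedURLException e) {
    // skip malformed input
}
sb.addToCrawler(urls, false); // asglobal=false: use the local crawler
```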
public void initBookmarks(boolean b)
public int adminAuthenticated(RequestHeader requestHeader)
Parameters: requestHeader - requestHeader.AUTHORIZATION = B64encode("adminname:password") or = B64encode("adminname:valueOf_Base64MD5cft"); adminAccountBase64MD5 = MD5(B64encode("adminname:password")) or = "MD5:"+MD5("adminname:peername:password")
public boolean verifyAuthentication(RequestHeader header)
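A sketch of a servlet-side guard built on the documented rule that access shall be granted for a returned auth-level >= 2; the response handling is illustrative:

```java
int authLevel = sb.adminAuthenticated(requestHeader);
if (authLevel >= 2) {
    // serve the protected page (levels 2-4, see the cases above)
} else {
    // deny access / request credentials
}
```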
public java.lang.String dhtShallTransfer()
public boolean dhtTransferJob()
public final void heuristicSite(SearchEvent searchEvent, java.lang.String host)
public final void heuristicSearchResults(URIMetadataNode resulturl)
Parameters: resulturl - the result doc whose outbound links are added to the crawler
@Deprecated public final void heuristicRSS(java.lang.String urlpattern, SearchEvent searchEvent, java.lang.String feedName)
Parameters: urlpattern - the search query url (e.g. http://search.org?query=searchword); searchEvent; feedName - short/internal name of the remote system
public static int currentPPM()
public float averageQPM()
public float averageQPMGlobal()
public float averageQPMPrivateLocal()
public float averageQPMPublicLocal()
public void updateMySeed()
public void loadSeedLists()
private static void loadSeedListConcurrently(SeedDB peers, java.lang.String seedListFileURL, java.util.concurrent.atomic.AtomicInteger scc, int timeout, boolean checkAge)
public void initRemoteProxy()
public void checkInterruption() throws java.lang.InterruptedException
Throws: java.lang.InterruptedException
public void terminate(long delay, java.lang.String reason)
public boolean shallTerminate()
public void terminate(java.lang.String reason)
public boolean isTerminated()
public boolean waitForShutdown() throws java.lang.InterruptedException
Throws: java.lang.InterruptedException
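A closing sketch of a controlled shutdown combining terminate and waitForShutdown; that the delay is in milliseconds is an assumption:

```java
sb.terminate(3000, "example: controlled shutdown"); // delay assumed to be ms
try {
    if (sb.waitForShutdown()) {
        // all switchboard threads terminated
    }
} catch (InterruptedException e) {
    Thread.currentThread().interrupt();
}
```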