public class Document
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
private java.util.Collection<AnchorURL> |
anchors |
private java.util.LinkedHashMap<AnchorURL,java.lang.String> |
applinks |
private java.util.LinkedHashMap<AnchorURL,java.lang.String> |
audiolinks |
static java.lang.String |
CANONICAL_MARKER |
private java.lang.String |
charset |
private int |
crawldepth |
private java.lang.StringBuilder |
creator |
private java.util.List<java.lang.String> |
descriptions |
private java.util.Set<AnchorURL> |
emaillinks |
static java.lang.String |
EMBED_MARKER |
private MultiProtocolURL |
favicon |
static java.lang.String |
FRAME_MARKER |
private java.util.Map<java.lang.String,java.util.Set<java.lang.String>> |
generic_facets |
private java.util.LinkedHashMap<AnchorURL,java.lang.String> |
hyperlinks |
static java.lang.String |
IFRAME_MARKER |
private java.util.LinkedHashMap<DigestURL,ImageEntry> |
images |
private java.util.LinkedHashMap<DigestURL,java.lang.String> |
inboundlinks |
private boolean |
indexingDenied |
private java.util.Set<java.lang.String> |
keywords |
private java.util.Set<java.lang.String> |
languages |
private java.util.Date |
lastModified |
private double |
lat |
private double |
lon |
private java.lang.String |
mimeType |
private java.util.LinkedHashMap<DigestURL,java.lang.String> |
outboundlinks |
private java.lang.Object |
parserObject |
private java.lang.String |
publisher |
private boolean |
resorted |
private java.util.LinkedHashMap<DigestURL,java.lang.String> |
rss |
private java.util.List<java.lang.String> |
sections |
private DigestURL |
source |
private java.lang.Object |
text |
private java.util.List<java.lang.String> |
titles |
private java.util.LinkedHashMap<AnchorURL,java.lang.String> |
videolinks |
Constructor and Description |
---|
Document(DigestURL location,
java.lang.String mimeType,
java.lang.String charset,
java.lang.Object parserObject,
java.util.Set<java.lang.String> languages,
java.lang.String[] keywords,
java.util.List<java.lang.String> titles,
java.lang.String author,
java.lang.String publisher,
java.lang.String[] sections,
java.util.List<java.lang.String> abstrcts,
double lon,
double lat,
java.lang.Object text,
java.util.Collection<AnchorURL> anchors,
java.util.LinkedHashMap<DigestURL,java.lang.String> rss,
java.util.LinkedHashMap<DigestURL,ImageEntry> images,
boolean indexingDenied,
java.util.Date lastModified) |
Modifier and Type | Method and Description |
---|---|
protected void |
addMetatags(java.util.Map<java.lang.String,java.util.Set<Tagging.Metatag>> tags)
add the given words to the set of keywords.
|
void |
addSubDocuments(Document[] docs)
Adds the main content of subdocuments to this document.
|
void |
addTags(java.util.Set<java.lang.String> tags)
add the given words to the set of keywords.
|
private static java.util.Map<AnchorURL,java.lang.String> |
allReflinks(java.util.Collection<?> links) |
static java.util.Map<MultiProtocolURL,java.lang.String> |
allSubpaths(java.util.Collection<?> links) |
void |
close() |
static java.util.Map<java.lang.String,java.util.Set<java.lang.String>> |
computeGenericFacets(java.util.Map<java.lang.String,java.util.Set<Tagging.Metatag>> tags)
compute generic facets
|
java.lang.String |
dc_creator() |
java.lang.String[] |
dc_description() |
java.lang.String |
dc_format() |
java.lang.String |
dc_identifier() |
java.lang.String |
dc_language()
Computes the language of this document from the languages declared for it.
The language is not computed using a statistical analysis of the content, only from given metadata that came with the document.
If there are several languages defined in the document, the TLD is taken to check which one should be picked.
If there is no metadata at all, null is returned.
|
java.lang.String |
dc_publisher() |
DigestURL |
dc_source() |
java.util.Set<java.lang.String> |
dc_subject()
Get the set of keywords associated with the document
|
java.lang.String |
dc_subject(char separator)
Get the set of keywords associated with the document as a single string,
with each keyword separated by the separator character
|
java.lang.String |
dc_title()
Get the main document title.
|
private static java.lang.String |
description(Document d,
java.lang.String tagname) |
java.util.Collection<AnchorURL> |
getAnchors()
All anchor links of the document
(this includes mailto links)
|
java.util.Map<AnchorURL,java.lang.String> |
getApplinks() |
static java.util.Map<DigestURL,java.lang.String> |
getApplinks(Document[] documents) |
java.util.Map<AnchorURL,java.lang.String> |
getAudiolinks() |
static java.util.Map<DigestURL,java.lang.String> |
getAudiolinks(Document[] documents) |
java.lang.String |
getCharset() |
Classification.ContentDomain |
getContentDomain()
Get the content domain of a document.
|
java.util.Set<java.lang.String> |
getContentLanguages() |
int |
getDepth() |
java.util.Set<AnchorURL> |
getEmaillinks() |
MultiProtocolURL |
getFavicon() |
java.lang.String |
getFileName() |
java.util.Map<java.lang.String,java.util.Set<java.lang.String>> |
getGenericFacets() |
java.util.Map<AnchorURL,java.lang.String> |
getHyperlinks()
List of links to resources (pages, images, files, media ...)
(Hyperlinks do not include mailto: links)
|
static java.util.Map<AnchorURL,java.lang.String> |
getHyperlinks(Document[] documents,
boolean includeNofollow) |
static java.util.Map<DigestURL,java.lang.String> |
getImagelinks(Document[] documents) |
java.util.LinkedHashMap<DigestURL,ImageEntry> |
getImages() |
java.util.Date |
getLastModified() |
java.lang.Object |
getParserObject() |
java.util.LinkedHashMap<DigestURL,java.lang.String> |
getRSS() |
java.lang.String[] |
getSectionTitles() |
java.util.List<java.lang.StringBuilder> |
getSentences(boolean pre) |
long |
getTextLength() |
java.io.InputStream |
getTextStream() |
java.lang.String |
getTextString() |
java.util.Map<AnchorURL,java.lang.String> |
getVideolinks() |
static java.util.Map<DigestURL,java.lang.String> |
getVideolinks(Document[] documents) |
int |
inboundLinkNofollowCount() |
java.util.LinkedHashMap<DigestURL,java.lang.String> |
inboundLinks() |
boolean |
indexingDenied() |
double |
lat() |
double |
lon() |
static Document |
mergeDocuments(DigestURL location,
java.lang.String globalMime,
Document[] docs)
merge documents: a helper method for all parsers that return multiple documents
|
int |
outboundLinkNofollowCount() |
java.util.LinkedHashMap<DigestURL,java.lang.String> |
outboundLinks() |
private void |
resortLinks()
sorts all links (anchors) into individual collections
|
void |
rewrite_dc_source(java.util.regex.Pattern pattern,
java.lang.String replacement)
rewrite the dc_source; this can be used for normalization purpose
|
void |
setDepth(int depth) |
void |
setFavicon(MultiProtocolURL faviconURL) |
void |
setIndexingDenied(boolean indexingDenied) |
void |
setTitle(java.lang.String title)
Sets the title of the document, replacing any existing titles.
|
java.util.List<java.lang.String> |
titles() |
java.lang.String |
toString() |
void |
writeXML(java.io.Writer os) |
private DigestURL source
private final java.lang.String mimeType
private final java.lang.String charset
private final java.util.Set<java.lang.String> keywords
private java.util.List<java.lang.String> titles
private final java.lang.StringBuilder creator
private final java.lang.String publisher
private final java.util.List<java.lang.String> sections
private final java.util.List<java.lang.String> descriptions
private java.lang.Object text
private final java.util.Collection<AnchorURL> anchors
private final java.util.LinkedHashMap<DigestURL,java.lang.String> rss
private final java.util.LinkedHashMap<DigestURL,ImageEntry> images
private java.util.LinkedHashMap<AnchorURL,java.lang.String> audiolinks
private java.util.LinkedHashMap<AnchorURL,java.lang.String> videolinks
private java.util.LinkedHashMap<AnchorURL,java.lang.String> applinks
private java.util.LinkedHashMap<AnchorURL,java.lang.String> hyperlinks
private java.util.LinkedHashMap<DigestURL,java.lang.String> inboundlinks
private java.util.LinkedHashMap<DigestURL,java.lang.String> outboundlinks
private java.util.Set<AnchorURL> emaillinks
private MultiProtocolURL favicon
private boolean resorted
private final java.util.Set<java.lang.String> languages
private boolean indexingDenied
private final double lon
private final double lat
private final java.lang.Object parserObject
private final java.util.Map<java.lang.String,java.util.Set<java.lang.String>> generic_facets
private final java.util.Date lastModified
private int crawldepth
public static final java.lang.String CANONICAL_MARKER
public static final java.lang.String IFRAME_MARKER
public static final java.lang.String FRAME_MARKER
public static final java.lang.String EMBED_MARKER
public Document(DigestURL location, java.lang.String mimeType, java.lang.String charset, java.lang.Object parserObject, java.util.Set<java.lang.String> languages, java.lang.String[] keywords, java.util.List<java.lang.String> titles, java.lang.String author, java.lang.String publisher, java.lang.String[] sections, java.util.List<java.lang.String> abstrcts, double lon, double lat, java.lang.Object text, java.util.Collection<AnchorURL> anchors, java.util.LinkedHashMap<DigestURL,java.lang.String> rss, java.util.LinkedHashMap<DigestURL,ImageEntry> images, boolean indexingDenied, java.util.Date lastModified)
public Classification.ContentDomain getContentDomain()
public java.lang.Object getParserObject()
public java.util.Set<java.lang.String> getContentLanguages()
public java.lang.String getFileName()
public java.util.Map<java.lang.String,java.util.Set<java.lang.String>> getGenericFacets()
public java.lang.String dc_language()
public java.lang.String dc_title()
public java.util.List<java.lang.String> titles()
public void setTitle(java.lang.String title)
title
-
public java.lang.String dc_creator()
public void addTags(java.util.Set<java.lang.String> tags)
tags
-
protected void addMetatags(java.util.Map<java.lang.String,java.util.Set<Tagging.Metatag>> tags)
tags
- a map where the key is the navigator name and the value is the set of attributes as metatags
public static java.util.Map<java.lang.String,java.util.Set<java.lang.String>> computeGenericFacets(java.util.Map<java.lang.String,java.util.Set<Tagging.Metatag>> tags)
tags
- a map where the key is the navigator name and the value is the set of attributes as metatags
public java.util.Set<java.lang.String> dc_subject()
public java.lang.String dc_subject(char separator)
separator
- character
public java.lang.String[] dc_description()
public java.lang.String dc_publisher()
public java.lang.String dc_format()
public java.lang.String dc_identifier()
public DigestURL dc_source()
public void rewrite_dc_source(java.util.regex.Pattern pattern, java.lang.String replacement)
pattern
-
replacement
-
public java.lang.String getCharset()
null
if unknown
public java.lang.String[] getSectionTitles()
public java.io.InputStream getTextStream()
public java.lang.String getTextString()
public long getTextLength()
public java.util.List<java.lang.StringBuilder> getSentences(boolean pre)
public java.util.Collection<AnchorURL> getAnchors()
public java.util.LinkedHashMap<DigestURL,java.lang.String> getRSS()
public java.util.Map<AnchorURL,java.lang.String> getHyperlinks()
public java.util.Map<AnchorURL,java.lang.String> getAudiolinks()
public java.util.Map<AnchorURL,java.lang.String> getVideolinks()
public java.util.LinkedHashMap<DigestURL,ImageEntry> getImages()
public java.util.Map<AnchorURL,java.lang.String> getApplinks()
public java.util.Set<AnchorURL> getEmaillinks()
public java.util.Date getLastModified()
public double lon()
public double lat()
private void resortLinks()
public static java.util.Map<MultiProtocolURL,java.lang.String> allSubpaths(java.util.Collection<?> links)
private static java.util.Map<AnchorURL,java.lang.String> allReflinks(java.util.Collection<?> links)
public void addSubDocuments(Document[] docs) throws java.io.IOException
docs
- to be included
java.io.IOException
mergeDocuments()
public MultiProtocolURL getFavicon()
URL
to the favicon that belongs to the document
public void setFavicon(MultiProtocolURL faviconURL)
faviconURL
- the URL
to the favicon that belongs to the document
public int inboundLinkNofollowCount()
public int outboundLinkNofollowCount()
public java.util.LinkedHashMap<DigestURL,java.lang.String> inboundLinks()
public java.util.LinkedHashMap<DigestURL,java.lang.String> outboundLinks()
public boolean indexingDenied()
public void setIndexingDenied(boolean indexingDenied)
public void setDepth(int depth)
public int getDepth()
public void writeXML(java.io.Writer os) throws java.io.IOException
java.io.IOException
public java.lang.String toString()
toString
in class java.lang.Object
public void close()
public static Document mergeDocuments(DigestURL location, java.lang.String globalMime, Document[] docs)
docs
-
public static java.util.Map<AnchorURL,java.lang.String> getHyperlinks(Document[] documents, boolean includeNofollow)
public static java.util.Map<DigestURL,java.lang.String> getImagelinks(Document[] documents)
public static java.util.Map<DigestURL,java.lang.String> getAudiolinks(Document[] documents)
public static java.util.Map<DigestURL,java.lang.String> getVideolinks(Document[] documents)
public static java.util.Map<DigestURL,java.lang.String> getApplinks(Document[] documents)
private static final java.lang.String description(Document d, java.lang.String tagname)