public class ContentScraper extends AbstractScraper implements Scraper
Modifier and Type | Class and Description |
---|---|
static class |
ContentScraper.Tag |
static class |
ContentScraper.TagName |
static class |
ContentScraper.TagType |
Modifier and Type | Field and Description |
---|---|
private java.util.List<AnchorURL> |
anchors |
private java.util.List<java.lang.String> |
articles |
private ClusteredScoreMap<java.lang.String> |
bold |
private int |
breadcrumbs |
private AnchorURL |
canonical |
private static java.util.regex.Pattern |
commaSepPattern |
private CharBuffer |
content |
private java.util.LinkedHashMap<DigestURL,java.lang.String> |
css |
private java.util.List<java.lang.String> |
dd |
private char |
degree |
private static java.util.regex.Pattern |
dpssp |
private java.util.List<java.lang.String> |
dt |
private java.util.LinkedHashMap<AnchorURL,EmbedEntry> |
embeds |
private java.util.List<java.util.Date> |
endDates |
private Evaluation |
evaluationScores
evaluation scores: count appearance of specific attributes
|
private MultiProtocolURL |
favicon
MultiProtocolURL to the favicon that belongs to the document |
private java.util.Set<AnchorURL> |
frames |
private java.util.List<java.lang.String>[] |
headlines |
private java.util.Map<java.lang.String,DigestURL> |
hreflang |
private javax.swing.event.EventListenerList |
htmlFilterEventListeners |
private java.util.Set<AnchorURL> |
iframes |
private java.util.List<ImageEntry> |
images |
private ClusteredScoreMap<java.lang.String> |
italic |
private double |
lat |
private static java.util.regex.Pattern |
LB |
private java.util.List<java.lang.String> |
li |
private static java.util.Set<java.lang.String> |
linkTags0 |
private static java.util.Set<java.lang.String> |
linkTags1 |
private double |
lon |
static int |
MAX_DOCSIZE |
private static int |
MAX_TAGSIZE |
private int |
maxLinks |
private java.util.Map<java.lang.String,java.lang.String> |
metas |
private char[] |
minuteCharsHTML |
private java.util.Map<java.lang.String,DigestURL> |
navigation |
private static java.util.regex.Pattern |
protp |
private AnchorURL |
publisher |
private DigestURL |
root
The document root MultiProtocolURL |
private java.util.LinkedHashMap<DigestURL,java.lang.String> |
rss |
private java.util.Set<AnchorURL> |
script |
private static java.util.regex.Pattern |
semicSepPattern |
private java.util.List<java.util.Date> |
startDates |
private int |
timezoneOffset |
private java.util.LinkedHashSet<java.lang.String> |
titles |
private ClusteredScoreMap<java.lang.String> |
underline |
private VocabularyScraper |
vocabularyScraper |
EMPTY_STRING, lb, rb, sl, sp
Constructor and Description |
---|
ContentScraper(DigestURL root,
int maxLinks,
VocabularyScraper vocabularyScraper,
int timezoneOffset)
scrape a document
|
Modifier and Type | Method and Description |
---|---|
private AnchorURL |
absolutePath(java.lang.String relativePath) |
int |
breadcrumbCount() |
private void |
checkOpts(ContentScraper.Tag tag) |
void |
close() |
boolean |
containsFlash() |
void |
deregisterHtmlFilterEventListener(ScraperListener listener) |
private static int |
find(java.lang.String s,
java.util.regex.Pattern m,
int start) |
void |
finish() |
private void |
fireScrapeTag0(java.lang.String tagname,
java.util.Properties tagopts) |
private void |
fireScrapeTag1(java.lang.String tagname,
java.util.Properties tagopts,
char[] text) |
boolean |
followDenied() |
java.util.List<AnchorURL> |
getAnchors() |
java.util.List<java.lang.String> |
getArticles() |
java.lang.String |
getAuthor() |
java.lang.String[] |
getBold() |
java.lang.String[] |
getBoldCount(java.lang.String[] a) |
AnchorURL |
getCanonical() |
java.util.Set<java.lang.String> |
getContentLanguages() |
java.lang.String |
getContentType() |
java.util.Map<DigestURL,java.lang.String> |
getCSS() |
java.util.Date |
getDate() |
java.lang.String[] |
getDd() |
java.util.List<java.lang.String> |
getDescriptions() |
java.lang.String[] |
getDt() |
java.util.Map<AnchorURL,EmbedEntry> |
getEmbeds() |
java.util.List<java.util.Date> |
getEndDates() |
java.util.Set<java.lang.String> |
getEvaluationModelNames()
produce all model names
|
java.lang.String[] |
getEvaluationModelScoreCounts(java.lang.String modelName,
java.lang.String[] a) |
java.lang.String[] |
getEvaluationModelScoreNames(java.lang.String modelName) |
MultiProtocolURL |
getFavicon() |
DigestURL[] |
getFlash() |
java.util.Set<AnchorURL> |
getFrames() |
java.lang.String[] |
getHeadlines(int i) |
java.util.Map<java.lang.String,DigestURL> |
getHreflang() |
java.util.Set<AnchorURL> |
getIFrames() |
java.util.List<ImageEntry> |
getImages()
get all images
|
java.lang.String[] |
getItalic() |
java.lang.String[] |
getItalicCount(java.lang.String[] a) |
java.lang.String[] |
getKeywords() |
double |
getLat() |
java.lang.String[] |
getLi() |
double |
getLon() |
java.util.Map<java.lang.String,java.lang.String> |
getMetas() |
java.util.Map<java.lang.String,DigestURL> |
getNavigation() |
java.lang.String |
getPublisher() |
DigestURL |
getPublisherLink() |
java.lang.String |
getRefreshPath() |
int |
getRefreshSeconds() |
java.util.LinkedHashMap<DigestURL,java.lang.String> |
getRSS() |
java.util.Set<AnchorURL> |
getScript() |
java.util.List<java.util.Date> |
getStartDates() |
java.lang.String |
getText() |
java.util.List<java.lang.String> |
getTitles() |
java.lang.String[] |
getUnderline() |
java.lang.String[] |
getUnderlineCount(java.lang.String[] a) |
boolean |
indexingDenied() |
static ContentScraper |
parseResource(java.io.File file,
int maxLinks,
int timezoneOffset) |
void |
print() |
private java.lang.String |
recursiveParse(AnchorURL linkurl,
char[] inlineHtml) |
void |
registerHtmlFilterEventListener(ScraperListener listener) |
void |
scrapeComment(char[] comment) |
void |
scrapeTag0(ContentScraper.Tag tag) |
void |
scrapeTag1(ContentScraper.Tag tag) |
void |
scrapeText(char[] newtext0,
java.lang.String insideTag) |
cleanLine, isTag0, isTag1, main, stripAllTags
private static final int MAX_TAGSIZE
public static final int MAX_DOCSIZE
private final char degree
private final char[] minuteCharsHTML
private static final java.util.Set<java.lang.String> linkTags0
private static final java.util.Set<java.lang.String> linkTags1
private static final java.util.regex.Pattern LB
private final java.util.List<AnchorURL> anchors
private final java.util.LinkedHashMap<DigestURL,java.lang.String> rss
private final java.util.LinkedHashMap<DigestURL,java.lang.String> css
private final java.util.LinkedHashMap<AnchorURL,EmbedEntry> embeds
private final java.util.List<ImageEntry> images
private final java.util.Set<AnchorURL> script
private final java.util.Set<AnchorURL> frames
private final java.util.Set<AnchorURL> iframes
private final java.util.Map<java.lang.String,java.lang.String> metas
private final java.util.Map<java.lang.String,DigestURL> hreflang
private final java.util.Map<java.lang.String,DigestURL> navigation
private java.util.LinkedHashSet<java.lang.String> titles
private final java.util.List<java.lang.String> articles
private final java.util.List<java.util.Date> startDates
private final java.util.List<java.util.Date> endDates
private java.util.List<java.lang.String>[] headlines
private final ClusteredScoreMap<java.lang.String> bold
private final ClusteredScoreMap<java.lang.String> italic
private final ClusteredScoreMap<java.lang.String> underline
private final java.util.List<java.lang.String> li
private final java.util.List<java.lang.String> dt
private final java.util.List<java.lang.String> dd
private final CharBuffer content
private final javax.swing.event.EventListenerList htmlFilterEventListeners
private double lon
private double lat
private AnchorURL canonical
private AnchorURL publisher
private final int maxLinks
private final VocabularyScraper vocabularyScraper
private final int timezoneOffset
private int breadcrumbs
private MultiProtocolURL favicon
MultiProtocolURL to the favicon that belongs to the document
private DigestURL root
The document root MultiProtocolURL
private final Evaluation evaluationScores
private static final java.util.regex.Pattern dpssp
private static final java.util.regex.Pattern protp
private static final java.util.regex.Pattern commaSepPattern
private static final java.util.regex.Pattern semicSepPattern
public ContentScraper(DigestURL root, int maxLinks, VocabularyScraper vocabularyScraper, int timezoneOffset)
root
- the document root url
maxLinks
- the maximum number of links to scrape
classDetector
- a map from class names to vocabulary names to scrape content from the DOM with associated class name
public void scrapeText(char[] newtext0, java.lang.String insideTag)
scrapeText
in interface Scraper
scrapeText
in class AbstractScraper
private static final int find(java.lang.String s, java.util.regex.Pattern m, int start)
private AnchorURL absolutePath(java.lang.String relativePath)
private void checkOpts(ContentScraper.Tag tag)
public void scrapeTag0(ContentScraper.Tag tag)
scrapeTag0
in interface Scraper
scrapeTag0
in class AbstractScraper
public void scrapeTag1(ContentScraper.Tag tag)
scrapeTag1
in interface Scraper
scrapeTag1
in class AbstractScraper
public void scrapeComment(char[] comment)
scrapeComment
in interface Scraper
private java.lang.String recursiveParse(AnchorURL linkurl, char[] inlineHtml)
public java.util.List<java.lang.String> getTitles()
public java.lang.String[] getHeadlines(int i)
public java.lang.String[] getBold()
public java.lang.String[] getBoldCount(java.lang.String[] a)
public java.lang.String[] getItalic()
public java.lang.String[] getItalicCount(java.lang.String[] a)
public java.lang.String[] getUnderline()
public java.lang.String[] getUnderlineCount(java.lang.String[] a)
public java.lang.String[] getLi()
public java.lang.String[] getDt()
public java.lang.String[] getDd()
public java.util.List<java.util.Date> getStartDates()
public java.util.List<java.util.Date> getEndDates()
public DigestURL[] getFlash()
public boolean containsFlash()
public int breadcrumbCount()
public java.lang.String getText()
public java.util.List<java.lang.String> getArticles()
public java.util.List<AnchorURL> getAnchors()
public java.util.LinkedHashMap<DigestURL,java.lang.String> getRSS()
public java.util.Map<DigestURL,java.lang.String> getCSS()
public java.util.Set<AnchorURL> getFrames()
public java.util.Set<AnchorURL> getIFrames()
public java.util.Set<AnchorURL> getScript()
public AnchorURL getCanonical()
public DigestURL getPublisherLink()
public java.util.Map<java.lang.String,DigestURL> getHreflang()
public java.util.Map<java.lang.String,DigestURL> getNavigation()
public java.util.List<ImageEntry> getImages()
public java.util.Map<AnchorURL,EmbedEntry> getEmbeds()
public java.util.Map<java.lang.String,java.lang.String> getMetas()
public MultiProtocolURL getFavicon()
MultiProtocolURL to the favicon that belongs to the document
public boolean indexingDenied()
public boolean followDenied()
public java.util.List<java.lang.String> getDescriptions()
public java.lang.String getContentType()
public java.lang.String getAuthor()
public java.lang.String getPublisher()
public java.util.Set<java.lang.String> getContentLanguages()
public java.lang.String[] getKeywords()
public int getRefreshSeconds()
public java.lang.String getRefreshPath()
public java.util.Date getDate()
public double getLon()
public double getLat()
public java.util.Set<java.lang.String> getEvaluationModelNames()
public java.lang.String[] getEvaluationModelScoreNames(java.lang.String modelName)
public java.lang.String[] getEvaluationModelScoreCounts(java.lang.String modelName, java.lang.String[] a)
public void close()
close
in interface Scraper
close
in class AbstractScraper
public void print()
public void registerHtmlFilterEventListener(ScraperListener listener)
registerHtmlFilterEventListener
in interface Scraper
public void deregisterHtmlFilterEventListener(ScraperListener listener)
deregisterHtmlFilterEventListener
in interface Scraper
private void fireScrapeTag0(java.lang.String tagname, java.util.Properties tagopts)
private void fireScrapeTag1(java.lang.String tagname, java.util.Properties tagopts, char[] text)
public static ContentScraper parseResource(java.io.File file, int maxLinks, int timezoneOffset) throws java.io.IOException
java.io.IOException