Heritrix 3.1.0 소스 해석 (29)
26821 단어 Heritrix
CrawlServer 객체는 하나의 서버를 나타내며 해당 서버에 대한 정보를 저장합니다. 서버 이름, 포트, robots.txt 정보, Credential 집합 및 이와 관련된 조작 메서드 등을 포함합니다.
/** Serialization version identifier for this class. */
private static final long serialVersionUID = 3L;
/**
 * Sentinel held by {@code robotsFetched} until a robots.txt fetch result
 * has been recorded for this server.
 */
public static final long ROBOTS_NOT_FETCHED = -1;
/**
 * Only check if robots-fetch is perhaps superfluous after this many tries.
 * While a robots.txt URI has fewer fetch attempts than this and nothing
 * usable was fetched, the robots info is simply marked invalid.
 */
public static final long MIN_ROBOTS_RETRIES = 3;
/** Server identifier as passed to the constructor. */
private String server; // actually, host+port in the https case
/**
 * Port parsed from the text after the last ':' in {@code server};
 * -1 when there is no ':' or the suffix is not a number.
 */
private int port;
/** Robots rules currently in effect for this server. */
protected Robotstxt robotstxt;
/**
 * Time in milliseconds (System.currentTimeMillis) at which a robots.txt
 * result was last recorded, or ROBOTS_NOT_FETCHED if never.
 */
long robotsFetched = ROBOTS_NOT_FETCHED;
/** True once usable robots information has been established. */
boolean validRobots = false;
// Fetch statistics associated with this server.
// NOTE(review): not updated anywhere in this excerpt -- confirm where it is maintained.
FetchStats substats = new FetchStats();
// how many consecutive connection errors have been encountered;
// used to drive exponentially increasing retry timeout or decision
// to 'freeze' entire class (queue) of URIs
protected int consecutiveConnectionErrors = 0;
/**
 * Set of credentials. Declared transient and created lazily, so it stays
 * null until the first credential is added.
 */
private transient Set<Credential> credentials = null;
String server는 사이트 서버의 식별자를 나타내며, 생성자는 다음과 같습니다(서버 식별자와 포트를 초기화합니다).
/**
 * Creates a new CrawlServer object.
 *
 * The given string is stored as the server identifier. Whatever follows
 * its last ':' is parsed as the port; when there is no ':' or that suffix
 * is not a valid integer, the port is recorded as -1.
 *
 * @param h the host string for the server, optionally ending in ":port".
 */
public CrawlServer(String h) {
    // TODO: possibly check for illegal host string
    this.server = h;
    int parsedPort = -1; // default when no usable port suffix exists
    final int separator = h.lastIndexOf(":");
    if (separator >= 0) {
        final String suffix = h.substring(separator + 1);
        try {
            parsedPort = Integer.parseInt(suffix);
        } catch (NumberFormatException notNumeric) {
            // Suffix is not a number; keep the "no port" marker.
            parsedPort = -1;
        }
    }
    this.port = parsedPort;
}
다음 메서드들은 Robotstxt robotstxt 객체를 다루는 메서드입니다.
/**
 * @return the robots rules currently held for this server; may be null
 *         if none have been assigned yet.
 */
public Robotstxt getRobotstxt() {
    return this.robotstxt;
}
/**
 * Update the robotstxt state of this server from a robots.txt fetch.
 *
 * The fetch time is recorded on every call, even when nothing usable was
 * fetched. Outcomes:
 *  - nothing usable was fetched: validRobots is set to false;
 *  - a status outside the 2xx range: Robotstxt.NO_ROBOTS is installed and
 *    validRobots is set to true;
 *  - a 2xx status: the recorded content body is parsed into a new
 *    Robotstxt; if reading it raises an IOException the method falls back
 *    to Robotstxt.NO_ROBOTS, logs a warning, adds the exception to the
 *    URI's non-fatal failures and still sets validRobots to true.
 *
 * Side effect on the argument: a connection-lost result caused by an empty
 * server response is rewritten to S_DEEMED_NOT_FOUND on the URI.
 *
 * No IOException escapes this method; it is caught internally.
 *
 * @param curi the crawl URI containing the fetched robots.txt
 */
public synchronized void updateRobots(CrawlURI curi) {
// Stamp the attempt time first, regardless of the outcome below.
robotsFetched = System.currentTimeMillis();
// "Got something" means an HTTP GET that produced a positive status,
// or one already deemed not-found.
boolean gotSomething = curi.getFetchType() == HTTP_GET
&& (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND);
if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
// robots.txt lookup failed, still trying, no reason to consider IGNORE yet
validRobots = false;
return;
}
// special deeming for a particular kind of connection-lost (empty server response)
if (curi.getFetchStatus() == S_CONNECT_LOST
&& CollectionUtils.exists(curi.getNonFatalFailures(),
PredicateUtils.instanceofPredicate(NoHttpResponseException.class))) {
// Rewrites the status on the URI itself so the code below sees "not found".
curi.setFetchStatus(S_DEEMED_NOT_FOUND);
gotSomething = true;
}
if (!gotSomething) {
// robots.txt fetch failed and exceptions (ignore/deeming) don't apply; no valid robots info yet
validRobots = false;
return;
}
// Re-read the status: it may have been rewritten by the deeming step above.
int fetchStatus = curi.getFetchStatus();
if (fetchStatus < 200 || fetchStatus >= 300) {
// Not found or anything but a status code in the 2xx range is
// treated as giving access to all of a sites' content.
// This is the prevailing practice of Google, since 4xx
// responses on robots.txt are usually indicative of a
// misconfiguration or blanket-block, not an intentional
// indicator of partial blocking.
// TODO: consider handling server errors, redirects differently
robotstxt = Robotstxt.NO_ROBOTS;
validRobots = true;
return;
}
InputStream contentBodyStream = null;
try {
BufferedReader reader;
contentBodyStream = curi.getRecorder().getContentReplayInputStream();
// NOTE(review): no charset is passed, so the platform default charset
// is used to decode robots.txt -- confirm this is intended.
reader = new BufferedReader(new InputStreamReader(contentBodyStream));
robotstxt = new Robotstxt(reader);
validRobots = true;
} catch (IOException e) {
// Unreadable body: fall back to "no rules" but still treat the
// robots info as valid, and record the failure on the URI.
robotstxt = Robotstxt.NO_ROBOTS;
logger.log(Level.WARNING,"problem reading robots.txt for "+curi,e);
validRobots = true;
curi.getNonFatalFailures().add(e);
} finally {
// Closing the underlying stream; the reader wrapping it is not closed separately.
IOUtils.closeQuietly(contentBodyStream);
}
}
/**
 * Reports whether valid robots.txt information is currently held.
 *
 * A false result means either that no robots.txt fetch has been attempted
 * yet or that the attempt failed.
 *
 * @return the current value of the validRobots flag.
 */
public synchronized boolean isValidRobots() {
    return this.validRobots;
}
/**
 * Is the robots policy expired.
 *
 * This method will also return true if we haven't tried to get the
 * robots.txt for this server.
 *
 * @param validityDuration how long fetched robots information stays
 *        valid, in seconds; zero means it never expires.
 * @return true if the robots policy is expired.
 */
public synchronized boolean isRobotsExpired(int validityDuration) {
    // Never attempted: treat as expired so a fetch gets scheduled.
    if (robotsFetched == ROBOTS_NOT_FETCHED) {
        return true;
    }
    // Seconds to milliseconds; the long literal keeps the product in long range.
    final long validityMillis = validityDuration * 1000L;
    // A zero duration means "valid forever"; otherwise the policy has
    // expired once the fetch time plus the validity window is in the past.
    return validityMillis != 0
            && robotsFetched + validityMillis < System.currentTimeMillis();
}
다음 메서드들은 Set<Credential> credentials 집합을 다루는 메서드입니다.
/**
 * @return the credentials attached to this server, or null if none have
 *         been added. The returned set is the internal one, not a copy.
 */
public Set<Credential> getCredentials() {
    return credentials;
}
/**
 * @return True if at least one credential is attached to this instance.
 */
public boolean hasCredentials() {
    if (credentials == null) {
        return false;
    }
    return !credentials.isEmpty();
}
/**
 * Add a credential, creating the backing set on first use.
 *
 * @param cred Credential to add to this server's set of credentials.
 */
public void addCredential(Credential cred) {
    Set<Credential> target = this.credentials;
    if (target == null) {
        // Lazily create the set the first time a credential is added.
        target = new HashSet<Credential>();
        this.credentials = target;
    }
    target.add(cred);
}
UURI uuri 객체로부터 키를 생성하는 정적 메서드(사이트 서버 식별용)
/**
 * Get key to use doing lookup on server instances.
 *
 * The key is the URI's authority without user info. When there is no
 * authority, the hierarchical path is used instead, provided it consists
 * only of word characters, dots, colons, dashes and underscores. For https
 * URIs whose key does not already end in ":digits", the default https port
 * constant is appended.
 *
 * @param uuri UURI we're to get server key for.
 * @return String to use as server key, or null if none could be derived.
 * @throws URIException
 */
public static String getServerKey(UURI uuri) throws URIException {
    // TODO: evaluate if this is really necessary -- why not
    // make the server of a dns CandidateURI the looked-up domain,
    // also simplifying FetchDNS?
    String serverKey = uuri.getAuthorityMinusUserinfo();
    if (serverKey == null) {
        // Fallback for cases where getAuthority() fails (eg 'dns:'.
        // DNS UURIs have the 'domain' in the 'path' parameter, not
        // in the authority).
        final String hierPath = uuri.getCurrentHierPath();
        // Anything other than word chars, dots, colons, dashes and
        // underscores is thrown away.
        final boolean usable = hierPath != null && hierPath.matches("[-_\\w\\.:]+");
        serverKey = usable ? hierPath : null;
    }
    if (serverKey == null) {
        return null;
    }
    // If https and no port specified, add default https port to
    // distinguish https from http server without a port.
    if (uuri.getScheme().equals(UURIFactory.HTTPS)
            && !serverKey.matches(".+:[0-9]+")) {
        serverKey = serverKey + UURIFactory.HTTPS_PORT;
    }
    return serverKey;
}
CrawlHost 객체는 하나의 호스트를 나타내며, 호스트 식별자(도메인 이름), IP 주소, IP 조회 시각, 국가 코드 등의 정보를 저장합니다.
/** Flag value indicating always-valid IP */
public static final long IP_NEVER_EXPIRES = -1;
/** Flag value indicating an IP has not yet been looked up */
public static final long IP_NEVER_LOOKED_UP = -2;
/** Host name this object was created for. */
private String hostname;
/** Country code supplied at construction; null when none was given. */
private String countryCode;
/** Address most recently set for this host; may be null. */
private InetAddress ip;
/**
 * Time in milliseconds (System.currentTimeMillis) at which the IP was last
 * set, or IP_NEVER_LOOKED_UP if it has never been set.
 */
private long ipFetched = IP_NEVER_LOOKED_UP;
// Fetch statistics associated with this host.
// NOTE(review): not updated anywhere in this excerpt -- confirm where it is maintained.
protected FetchStats substats = new FetchStats();
/**
 * TTL gotten from dns record. Holds IP_NEVER_LOOKED_UP until an IP is set.
 *
 * From rfc1035:
 * <pre>
 * TTL a 32 bit unsigned integer that specifies the time
 * interval (in seconds) that the resource record may be
 * cached before it should be discarded. Zero values are
 * interpreted to mean that the RR can only be used for the
 * transaction in progress, and should not be cached.
 * </pre>
 */
private long ipTTL = IP_NEVER_LOOKED_UP;
// Used when bandwidth constraints are in effect
private long earliestNextURIEmitTime = 0;
생성자: 호스트 식별자를 초기화합니다.
/**
 * Create a new CrawlHost object with no country code.
 *
 * Delegates to the two-argument constructor, passing null as the
 * country code.
 *
 * @param hostname the host name for this host.
 */
public CrawlHost(String hostname) {
this(hostname, null);
}
/**
 * Create a new CrawlHost object.
 *
 * If InetAddressUtil.getIPHostAddress yields an address for the host name,
 * that address is set immediately with a never-expiring TTL.
 * NOTE(review): presumably that helper returns non-null only when the host
 * name is itself a numeric IP address -- confirm against InetAddressUtil.
 *
 * @param hostname the host name for this host.
 * @param countryCode the country code for this host.
 */
public CrawlHost(String hostname, String countryCode) {
    this.hostname = hostname;
    this.countryCode = countryCode;
    final InetAddress direct = InetAddressUtil.getIPHostAddress(hostname);
    if (direct == null) {
        // No address available up front; leave the IP as not looked up.
        return;
    }
    setIP(direct, IP_NEVER_EXPIRES);
}
다음 메서드들은 IP 주소의 조회 여부를 확인하고 IP 주소를 설정하는 데 사용됩니다.
/**
 * Return true if the IP for this host has been looked up.
 *
 * Returns true even if the lookup failed, because the check is made on the
 * recorded lookup time rather than on the address itself.
 *
 * @return true if the IP for this host has been looked up.
 */
public boolean hasBeenLookedUp() {
    final boolean neverLookedUp = (ipFetched == IP_NEVER_LOOKED_UP);
    return !neverLookedUp;
}
/**
 * Set the IP address for this host.
 *
 * Also records the current time as the moment of lookup and stores the
 * given TTL. At FINE log level the host name and address are logged.
 *
 * @param address the address to store; may be null.
 * @param ttl the TTL from the dns record in seconds or -1 if it should live
 * forever (is a numeric IP).
 */
public void setIP(InetAddress address, long ttl) {
    this.ip = address;
    // Assume that a lookup has occurred by the time
    // a caller decides to set this (even to null)
    this.ipFetched = System.currentTimeMillis();
    this.ipTTL = ttl;
    if (!logger.isLoggable(Level.FINE)) {
        return;
    }
    final String shown = (address == null) ? "null" : address.toString();
    logger.fine(hostname + ": " + shown);
}
---------------------------------------------------------------------------
본 시리즈의 Heritrix 3.1.0 원본 해석은 본인이 창작한 것입니다.
전재할 경우 출처가 博客园(cnblogs)의 블로그 '고슴도치의 온순함'임을 밝혀 주십시오.
본문 링크: http://www.cnblogs.com/chenying99/archive/2013/04/29/3050940.html
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
Heritrix 3.1.0 소스 해석(16)다음은 BdbFrontier 객체 CrawlURI next() 방법과 관련된 방법을 분석합니다. 이 방법은 좀 길어요. 먼저void wakeQueues() 방법을 볼게요. snoozedClassQueues.poll ...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.