Heritrix 3.1.0 소스 해석(17)
18072 단어 Heritrix
/**
* Note that the previously emitted CrawlURI has completed
* its processing (for now).
*
* The CrawlURI may be scheduled to retry, if appropriate,
* and other related URIs may become eligible for release
* via the next next() call, as a result of finished().
*
* Two paths exist. If needsReenqueuing(curi) is true, the URI is left in
* its WorkQueue, the queue is sent to its next state and the method returns
* early. Otherwise the URI is dequeued, tallied as succeeded, disregarded
* or failed, the queue is charged and sent to its next state, and the URI
* is either parked in futureUris (when a reschedule time is set) or
* stripped down.
*
* TODO: make as many decisions about what happens to the CrawlURI
* (success, failure, retry) and queue (retire, snooze, ready) as
* possible elsewhere, such as in DispositionProcessor. Then, break
* this into simple branches or focused methods for each case.
*
* @param curi the CrawlURI whose processing has just finished; its holder
* is cast to WorkQueue
* @see org.archive.crawler.framework.Frontier#finished(org.archive.modules.CrawlURI)
*/
protected void processFinish(CrawlURI curi) {
// assert Thread.currentThread() == managerThread;
// single timestamp, handed to handleQueue() on whichever path is taken
long now = System.currentTimeMillis();
// record that one more fetch attempt has been made
curi.incrementFetchAttempts();
logNonfatalErrors(curi);
WorkQueue wq = (WorkQueue) curi.getHolder();
// always refresh budgeting values from current curi
// (whose overlay settings should be active here)
wq.setSessionBudget(getBalanceReplenishAmount());
wq.setTotalBudget(getQueueTotalBudget());
assert (wq.peek(this) == curi) : "unexpected peek " + wq;
// cost to charge to the queue; zeroed below for disregarded URIs
int holderCost = curi.getHolderCost();
// ---- retry path: the URI is not consumed ----
if (needsReenqueuing(curi)) {
// codes/errors which don't consume the URI, leaving it atop queue
if(curi.getFetchStatus()!=S_DEFERRED) {
wq.expend(holderCost); // all retries but DEFERRED cost
}
// retryDelayFor() yields getRetryDelaySeconds() or 0; scale to milliseconds
long delay_ms = retryDelayFor(curi) * 1000;
curi.processingCleanup(); // lose state that shouldn't burden retry
wq.unpeek(curi);
wq.update(this, curi); // rewrite any changes
// send the queue to its next state: retired, snoozed for delay_ms, or re-enqueued
handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DEFERRED_FOR_RETRY));
doJournalReenqueued(curi);
wq.makeDirty();
return; // no further dequeueing, logging, rescheduling to occur
}
// ---- final-disposition path ----
// Curi will definitely be disposed of without retry, so remove from queue
wq.dequeue(this,curi);
decrementQueuedCount(1);
largestQueues.update(wq.getClassKey(), wq.getCount());
log(curi);
if (curi.isSuccess()) {
// codes deemed 'success'
incrementSucceededFetchCount();
totalProcessedBytes.addAndGet(curi.getRecordedSize());
appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,SUCCEEDED));
doJournalFinishedSuccess(curi);
} else if (isDisregarded(curi)) {
// codes meaning 'undo' (even though URI was enqueued,
// we now want to disregard it from normal success/failure tallies)
// (eg robots-excluded, operator-changed-scope, etc)
incrementDisregardedUriCount();
appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DISREGARDED));
holderCost = 0; // no charge for disregarded URIs
// TODO: consider reinstating forget-URI capability, so URI could be
// re-enqueued if discovered again
doJournalDisregarded(curi);
} else {
// codes meaning 'failure'
incrementFailedFetchCount();
appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,FAILED));
// if exception, also send to crawlErrors
if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
Object[] array = { curi };
loggerModule.getRuntimeErrors().log(Level.WARNING, curi.getUURI()
.toString(), array);
}
// charge queue any extra error penalty
wq.noteError(getErrorPenaltyAmount());
doJournalFinishedFailure(curi);
}
// expend() is called on all three branches; for disregarded URIs holderCost is 0
wq.expend(holderCost); // successes & failures charge cost to queue
// the URI's politeness delay decides whether the queue is snoozed or re-enqueued
long delay_ms = curi.getPolitenessDelay();
handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
wq.makeDirty();
if(curi.getRescheduleTime()>0) {
// marked up for forced-revisit at a set time
curi.processingCleanup();
curi.resetForRescheduling();
// park the URI in futureUris, keyed by its reschedule time, and count it
futureUris.put(curi.getRescheduleTime(),curi);
futureUriCount.incrementAndGet();
} else {
// no revisit requested: reduce the URI to its minimal form and clean up
curi.stripToMinimal();
curi.processingCleanup();
}
}
먼저 CrawlURI curi 객체를 대기열에 다시 넣어야 하는지 판단하는 메서드는 다음과 같습니다.
/**
 * Decides whether a just-processed CrawlURI should stay in its queue so
 * that it can be tried again later.
 *
 * @param curi
 *            the CrawlURI to examine
 * @return true if the URI should be retried; false if it should be
 *         disposed of
 */
protected boolean needsReenqueuing(CrawlURI curi) {
    // overMaxRetries() vetoes any further retry, whatever the status
    if (overMaxRetries(curi)) {
        return false;
    }

    final int fetchStatus = curi.getFetchStatus();

    if (fetchStatus == HttpStatus.SC_UNAUTHORIZED) {
        // A positive status code is usually a success, but a 401 is worth
        // another pass when an rfc2617 credential is attached to the URI
        // (assumed to have been loaded in FetchHTTP in expectation of
        // going around again). Without one we should not be here.
        final boolean hasCredential = curi.hasRfc2617Credential();
        if (!hasCredential && logger.isLoggable(Level.FINE)) {
            logger.fine("Have 401 but no creds loaded " + curi);
        }
        return hasCredential;
    }

    // these are all worth a retry
    // TODO: consider if any others (S_TIMEOUT in some cases?) deserve
    // retry
    if (fetchStatus == S_DEFERRED
            || fetchStatus == S_CONNECT_FAILED
            || fetchStatus == S_CONNECT_LOST
            || fetchStatus == S_DOMAIN_UNRESOLVABLE) {
        return true;
    }

    // no status at all is an error unless the URI carries a retire
    // directive for its queue
    if (fetchStatus == S_UNATTEMPTED) {
        return curi.includesRetireDirective();
    }

    return false;
}
long retryDelayFor(CrawlURI curi) 메서드는 재시도 전에 WorkQueue wq가 대기해야 할 지연 시간(초 단위)을 반환합니다.
/**
 * Return a suitable value to wait before retrying the given URI.
 *
 * The value is in seconds: it is either getRetryDelaySeconds() or 0, and
 * the caller is expected to scale it to milliseconds itself.
 *
 * @param curi
 *            CrawlURI to be retried
 * @return delay in seconds before retry; 0 when no delay applies
 */
protected long retryDelayFor(CrawlURI curi) {
    switch (curi.getFetchStatus()) {
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // connection and DNS failures wait the configured retry delay
            return getRetryDelaySeconds();
        default:
            // no delay for most
            return 0;
    }
}
getRetryDelaySeconds()의 값은 기본적으로 900초(15분)입니다.
그다음 CrawlURI curi 객체의 변경 사항을 WorkQueue wq에 반영(update)하고, 마지막으로 WorkQueue wq를 다음 상태로 보냅니다. (대기열을 퇴역(retire)시키거나, 휴면(snooze)시키거나, reenqueueQueue(wq)로 다시 대기열에 넣습니다.)
/**
 * Send an active queue to its next state, based on the supplied
 * parameters: retired, snoozed, or re-enqueued.
 *
 * @param wq
 *            the queue to move; it is first removed from inProcessQueues
 * @param forceRetire
 *            when true the queue is retired, regardless of delay_ms
 * @param now
 *            timestamp passed through to snoozeQueue()
 * @param delay_ms
 *            snooze duration; the queue is snoozed only when this is
 *            greater than zero
 */
protected void handleQueue(WorkQueue wq, boolean forceRetire, long now, long delay_ms) {
    // whatever happens next, the queue is no longer in process
    inProcessQueues.remove(wq);

    if (forceRetire) {
        retireQueue(wq);
        return;
    }

    if (delay_ms <= 0) {
        // nothing to wait for: enqueue the given queue to either
        // readyClassQueues or inactiveQueues, as appropriate
        reenqueueQueue(wq);
        return;
    }

    snoozeQueue(wq, now, delay_ms);
}
다음으로 wq.dequeue(this, curi) 호출을 살펴봅시다. 이 메서드는 WorkQueue wq에서 CrawlURI curi 객체를 제거합니다.
마지막으로 WorkQueue wq를 다음 상태로 보냅니다.
long delay_ms = curi.getPolitenessDelay();
handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
handleQueue 메서드는 위에서 이미 살펴보았습니다.
---------------------------------------------------------------------------
본 시리즈의 Heritrix 3.1.0 원본 해석은 본인이 창작한 것입니다.
전재할 때에는 출처가 cnblogs(博客园)의 "고슴도치의 온순함" 블로그임을 밝혀 주십시오.
본문 링크: http://www.cnblogs.com/chenying99/archive/2013/04/21/3033520.html
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
Heritrix 3.1.0 소스 해석(16)다음은 BdbFrontier 객체 CrawlURI next() 방법과 관련된 방법을 분석합니다. 이 방법은 좀 길어요. 먼저void wakeQueues() 방법을 볼게요. snoozedClassQueues.poll ...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.