Heritrix 3.1.0 소스 해석(17)

18072 단어 Heritrix
이번 글에서는 BdbFrontier 객체의 void finished(CrawlURI cURI) 메서드와 그에 관련된 메서드들을 분석합니다.
/**

     * Note that the previously emitted CrawlURI has completed

     * its processing (for now).

     *

     * The CrawlURI may be scheduled to retry, if appropriate,

     * and other related URIs may become eligible for release

     * via the next next() call, as a result of finished().

     *

     * TODO: make as many decisions about what happens to the CrawlURI

     * (success, failure, retry) and queue (retire, snooze, ready) as 

     * possible elsewhere, such as in DispositionProcessor. Then, break

     * this into simple branches or focused methods for each case. 

     *  

     * @see org.archive.crawler.framework.Frontier#finished(org.archive.modules.CrawlURI)

     */

    protected void processFinish(CrawlURI curi) {

        // Overview: the URI either (a) stays in its queue for a retry, in which

        // case the method returns early, or (b) is dequeued and tallied as

        // success / disregarded / failure. In both paths the queue is charged

        // its cost and handed to handleQueue() to pick its next state.

//        assert Thread.currentThread() == managerThread;        

        long now = System.currentTimeMillis();

        // record this pass as a fetch attempt on the URI, then log its nonfatal errors

        curi.incrementFetchAttempts();

        logNonfatalErrors(curi);

        

        WorkQueue wq = (WorkQueue) curi.getHolder();

        // always refresh budgeting values from current curi

        // (whose overlay settings should be active here)

        wq.setSessionBudget(getBalanceReplenishAmount());

        wq.setTotalBudget(getQueueTotalBudget());

        

        assert (wq.peek(this) == curi) : "unexpected peek " + wq;



        int holderCost = curi.getHolderCost();

        // retry path: the URI is not consumed and will be tried again later

        if (needsReenqueuing(curi)) {

            // codes/errors which don't consume the URI, leaving it atop queue

            if(curi.getFetchStatus()!=S_DEFERRED) {

                wq.expend(holderCost); // all retries but DEFERRED cost

            }

            // retryDelayFor() is in seconds; scale to the milliseconds handleQueue() expects

            long delay_ms = retryDelayFor(curi) * 1000;

            curi.processingCleanup(); // lose state that shouldn't burden retry

            wq.unpeek(curi);

            // store the curi's current state back into its work queue

            wq.update(this, curi); // rewrite any changes

            // send the queue to its next state: retire, snooze for delay_ms, or re-enqueue

            handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);

            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DEFERRED_FOR_RETRY));

            doJournalReenqueued(curi);

            wq.makeDirty();

            return; // no further dequeueing, logging, rescheduling to occur

        }



        // Curi will definitely be disposed of without retry, so remove from queue

        // and bring the frontier-wide queued count and largest-queues tracker up to date

        wq.dequeue(this,curi);

        decrementQueuedCount(1);

        largestQueues.update(wq.getClassKey(), wq.getCount());

        log(curi);

        

        if (curi.isSuccess()) {

            // codes deemed 'success' 

            incrementSucceededFetchCount();

            totalProcessedBytes.addAndGet(curi.getRecordedSize());

            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,SUCCEEDED));

            doJournalFinishedSuccess(curi);

           

        } else if (isDisregarded(curi)) {

            // codes meaning 'undo' (even though URI was enqueued, 

            // we now want to disregard it from normal success/failure tallies)

            // (eg robots-excluded, operator-changed-scope, etc)

            incrementDisregardedUriCount();

            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DISREGARDED));

            holderCost = 0; // no charge for disregarded URIs

            // TODO: consider reinstating forget-URI capability, so URI could be

            // re-enqueued if discovered again

            doJournalDisregarded(curi);

            

        } else {

            // codes meaning 'failure'

            incrementFailedFetchCount();

            appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,FAILED));

            // if exception, also send to crawlErrors

            if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {

                Object[] array = { curi };

                loggerModule.getRuntimeErrors().log(Level.WARNING, curi.getUURI()

                        .toString(), array);

            }        

            // charge queue any extra error penalty

            wq.noteError(getErrorPenaltyAmount());

            doJournalFinishedFailure(curi);

            

        }



        wq.expend(holderCost); // successes & failures charge cost to queue

        // the curi's politeness delay is the delay handed to handleQueue(),

        // which snoozes the queue when it is positive (unless the queue is retired)

        long delay_ms = curi.getPolitenessDelay();

        handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);

        wq.makeDirty();

        

        if(curi.getRescheduleTime()>0) {

            // marked up for forced-revisit at a set time

            curi.processingCleanup();

            curi.resetForRescheduling(); 

            // park the URI in futureUris, keyed by its reschedule time

            futureUris.put(curi.getRescheduleTime(),curi);

            futureUriCount.incrementAndGet(); 

        } else {

            // not rescheduled: strip the curi down and clean up its processing state

            curi.stripToMinimal();

            curi.processingCleanup();

        }

    }

먼저, CrawlURI curi 객체를 대기열에 다시 넣어 재시도해야 하는지 판단하는 메서드는 다음과 같습니다.
/**

     * Checks if a recently processed CrawlURI that did not finish successfully

     * needs to be reenqueued (and thus possibly, processed again after some 

     * time elapses)

     * 

     * @param curi

     *            The CrawlURI to check

     * @return True if we need to retry.

     */

    protected boolean needsReenqueuing(CrawlURI curi) {
        // A URI that is already over its retry limit is never re-enqueued.
        if (overMaxRetries(curi)) {
            return false;
        }

        final int fetchStatus = curi.getFetchStatus();

        if (fetchStatus == HttpStatus.SC_UNAUTHORIZED) {
            // A positive status code is usually a success, but a 401 earns
            // another pass when an rfc2617 credential is present on the URI:
            // it is assumed FetchHTTP loaded it expecting us to go around
            // again. Without such a credential there is nothing to retry with.
            final boolean credentialLoaded = curi.hasRfc2617Credential();
            if (!credentialLoaded && logger.isLoggable(Level.FINE)) {
                logger.fine("Have 401 but no creds loaded " + curi);
            }
            return credentialLoaded;
        }

        // Deferred fetches and connection/DNS problems are all worth a retry.
        // TODO: consider if any others (S_TIMEOUT in some cases?) deserve
        // retry
        if (fetchStatus == S_DEFERRED
                || fetchStatus == S_CONNECT_FAILED
                || fetchStatus == S_CONNECT_LOST
                || fetchStatus == S_DOMAIN_UNRESOLVABLE) {
            return true;
        }

        // An unattempted URI is retried only when it carries a retire
        // directive; no status without a queue-directive is an error, and
        // every other status is final.
        return fetchStatus == S_UNATTEMPTED && curi.includesRetireDirective();
    }

long retryDelayFor(CrawlURI curi) 메서드는 재시도 전에 WorkQueue wq를 대기시킬 지연 시간을 초 단위로 반환합니다.
/**

     * Return a suitable value to wait before retrying the given URI.

     * 

     * @param curi

     *            CrawlURI to be retried

     * @return delay in seconds before retry (the caller converts it to milliseconds)

     */

    protected long retryDelayFor(CrawlURI curi) {
        final int fetchStatus = curi.getFetchStatus();

        // Only connection-level and DNS failures wait before the retry.
        final boolean connectivityFailure = fetchStatus == S_CONNECT_FAILED
                || fetchStatus == S_CONNECT_LOST
                || fetchStatus == S_DOMAIN_UNRESOLVABLE;

        if (!connectivityFailure) {
            return 0; // no delay for most
        }
        // The value is in seconds, as the getter's name says.
        return getRetryDelaySeconds();
    }

getRetryDelaySeconds()의 기본값은 900초(15분)입니다.
그다음 CrawlURI curi 객체의 변경 사항을 WorkQueue wq에 반영하고(wq.update), 마지막으로 handleQueue 메서드에서 WorkQueue wq가 속할 대기열을 다시 정합니다. (퇴역(retire) 처리하거나, 휴면(snooze) 대기열에 넣거나, reenqueueQueue(wq)로 다시 대기열에 넣어 처리합니다.)
/**

     * Decides where WorkQueue wq goes next (retired, snoozed, or re-enqueued).

     * Send an active queue to its next state, based on the supplied 

     * parameters.

     * 

     * @param wq

     * @param forceRetire

     * @param now

     * @param delay_ms

     */

    protected void handleQueue(WorkQueue wq, boolean forceRetire, long now, long delay_ms) {
        // Whatever happens next, the queue is first removed from inProcessQueues.
        inProcessQueues.remove(wq);

        // A retire directive takes precedence over any delay.
        if (forceRetire) {
            retireQueue(wq);
            return;
        }

        // With no positive delay, enqueue the given queue to either
        // readyClassQueues or inactiveQueues, as appropriate.
        if (delay_ms <= 0) {
            reenqueueQueue(wq);
            return;
        }

        // Otherwise snooze the queue for delay_ms, measured from 'now'.
        snoozeQueue(wq, now, delay_ms);
    }

이어서 wq.dequeue(this, curi) 호출을 살펴봅니다. 이 호출은 WorkQueue wq에서 CrawlURI curi 객체를 제거합니다.
마지막으로 WorkQueue wq가 속할 대기열을 다시 정합니다.
 long delay_ms = curi.getPolitenessDelay(); 

 handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);

handleQueue 메서드는 위에서 이미 살펴보았습니다.
---------------------------------------------------------------------------
본 시리즈의 Heritrix 3.1.0 원본 해석은 본인이 창작한 것입니다.
전재할 때는 출처가 블로그 정원(cnblogs)의 '고슴도치의 온순함'임을 밝혀 주십시오.
본문 링크: http://www.cnblogs.com/chenying99/archive/2013/04/21/3033520.html

좋은 웹페이지 즐겨찾기