Heritrix 3.1.0 소스 해석(16)

27863 단어 Heritrix
다음으로 BdbFrontier 객체의 CrawlURI next() 메서드와 관련된 메서드들을 분석합니다.
/**

     * Return the next CrawlURI eligible to be processed (and presumably

     * visited/fetched) by a a worker thread.

     *

     * Relies on the readyClassQueues having been loaded with

     * any work queues that are eligible to provide a URI. 

     *

     * @return next CrawlURI eligible to be processed, or null if none available

     *

     * @see org.archive.crawler.framework.Frontier#next()

     */

    protected CrawlURI findEligibleURI() {

            // wake any snoozed queues

            wakeQueues();

            // consider rescheduled URIS

            checkFutures();

                   

            // find a non-empty ready queue, if any 

            // TODO: refactor to untangle these loops, early-exits, etc!

            WorkQueue readyQ = null;

            findauri: while(true) {

                findaqueue: do {

                    String key = readyClassQueues.poll();

                    if(key==null) {

                        // no ready queues; try to activate one

                        if(!getInactiveQueuesByPrecedence().isEmpty() 

                            && highestPrecedenceWaiting < getPrecedenceFloor()) {

                            activateInactiveQueue();

                            continue findaqueue;

                        } else {

                            // nothing ready or readyable

                            break findaqueue;

                        }

                    }

                    readyQ = getQueueFor(key);

                    if(readyQ==null) {

                         // readyQ key wasn't in all queues: unexpected

                        logger.severe("Key "+ key +

                            " in readyClassQueues but not allQueues");

                        break findaqueue;

                    }

                    if(readyQ.getCount()==0) {

                        // readyQ is empty and ready: it's exhausted

                        readyQ.noteExhausted(); 

                        readyQ.makeDirty();

                        readyQ = null;

                        continue; 

                    }

                    if(!inProcessQueues.add(readyQ)) {

                        // double activation; discard this and move on

                        // (this guard allows other enqueuings to ready or 

                        // the various inactive-by-precedence queues to 

                        // sometimes redundantly enqueue a queue key)

                        readyQ = null; 

                        continue;

                    }

                    // queue has gone 'in process' 

                    readyQ.considerActive();

                    readyQ.setWakeTime(0); // clear obsolete wake time, if any



                    readyQ.setSessionBudget(getBalanceReplenishAmount());

                    readyQ.setTotalBudget(getQueueTotalBudget()); 

                    if (readyQ.isOverSessionBudget()) {

                        deactivateQueue(readyQ);

                        readyQ.makeDirty();

                        readyQ = null;

                        continue; 

                    }

                    if (readyQ.isOverTotalBudget()) {

                        retireQueue(readyQ);

                        readyQ.makeDirty();

                        readyQ = null;

                        continue; 

                    }

                } while (readyQ == null);

                

                if (readyQ == null) {

                    // no queues left in ready or readiable

                    break findauri; 

                }

           

                returnauri: while(true) { // loop left by explicit return or break on empty

                    CrawlURI curi = null;

                    curi = readyQ.peek(this);   

                    if(curi == null) {

                        // should not reach

                        logger.severe("No CrawlURI from ready non-empty queue "

                                + readyQ.classKey + "
" + readyQ.shortReportLegend() + "
" + readyQ.shortReportLine() + "
"); break returnauri; } // from queues, override names persist but not map source curi.setOverlayMapsSource(sheetOverlaysManager); // TODO: consider optimizations avoiding this recalc of // overrides when not necessary sheetOverlaysManager.applyOverlaysTo(curi); // check if curi belongs in different queue String currentQueueKey; try { KeyedProperties.loadOverridesFrom(curi); currentQueueKey = getClassKey(curi); } finally { KeyedProperties.clearOverridesFrom(curi); } if (currentQueueKey.equals(curi.getClassKey())) { // curi was in right queue, emit noteAboutToEmit(curi, readyQ); return curi; } // URI's assigned queue has changed since it // was queued (eg because its IP has become // known). Requeue to new queue. // TODO: consider synchronization on readyQ readyQ.dequeue(this,curi); doJournalRelocated(curi); curi.setClassKey(currentQueueKey); decrementQueuedCount(1); curi.setHolderKey(null); sendToQueue(curi); if(readyQ.getCount()==0) { // readyQ is empty and ready: it's exhausted // release held status, allowing any subsequent // enqueues to again put queue in ready // FIXME: tiny window here where queue could // receive new URI, be readied, fail not-in-process? inProcessQueues.remove(readyQ); readyQ.noteExhausted(); readyQ.makeDirty(); readyQ = null; continue findauri; } } } if(inProcessQueues.size()==0) { // Nothing was ready or in progress or imminent to wake; ensure // any piled-up pending-scheduled URIs are considered uriUniqFilter.requestFlush(); } // if truly nothing ready, wait a moment before returning null // so that loop in surrounding next() has a chance of getting something // next time if(getTotalEligibleInactiveQueues()==0) { try { Thread.sleep(1000); } catch (InterruptedException e) { // } } // nothing eligible return null; }

이 메서드는 다소 깁니다. 먼저 void wakeQueues() 메서드를 살펴보겠습니다.
     /**
     * Wake any queues sitting in the snoozed queue whose time has come.
     *
     * Every queue taken from snoozedClassQueues, and every entry of
     * snoozedOverflow keyed before the current time, has its wake time
     * cleared, is marked dirty, and is handed to reenqueueQueue.
     */
    protected void wakeQueues() {
        // drain whatever snoozedClassQueues.poll() is willing to hand out
        for (DelayedWorkQueue expired = snoozedClassQueues.poll();
                expired != null;
                expired = snoozedClassQueues.poll()) {
            WorkQueue awakened = expired.getWorkQueue(this);
            awakened.setWakeTime(0);
            awakened.makeDirty();
            reenqueueQueue(awakened);
        }

        // also consider overflow (usually empty)
        if (snoozedOverflow.isEmpty()) {
            return;
        }
        synchronized (snoozedOverflow) {
            long now = System.currentTimeMillis();
            Iterator<DelayedWorkQueue> due =
                snoozedOverflow.headMap(now).values().iterator();
            while (due.hasNext()) {
                DelayedWorkQueue overflowed = due.next();
                due.remove();
                snoozedOverflowCount.decrementAndGet();
                WorkQueue awakened = overflowed.getWorkQueue(this);
                awakened.setWakeTime(0);
                awakened.makeDirty();
                reenqueueQueue(awakened);
            }
        }
    }

snoozedClassQueues.poll() 메서드로 휴면 대기열에서 휴면 시간이 만료된 요소를 꺼내고, 깨어날 시각(wake time)을 0으로 초기화한 다음, reenqueueQueue(queue)를 호출하여 WorkQueue wq를 비활성 대기열 또는 준비 대기열(readyClassQueues) 중 알맞은 곳에 다시 넣습니다.
/**
     * Enqueue the given queue to either readyClassQueues or inactiveQueues,
     * as appropriate.
     *
     * The queue's precedence is first re-evaluated through the queue
     * precedence policy. The queue is readied only when its precedence value
     * is no greater than highestPrecedenceWaiting and is below the
     * precedence floor; otherwise it is deactivated.
     *
     * @param wq the work queue to place
     */
    protected void reenqueueQueue(WorkQueue wq) {
        //TODO:SPRINGY set overrides by queue?
        getQueuePrecedencePolicy().queueReevaluate(wq);
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("queue reenqueued: " + wq.getClassKey());
        }

        boolean mayRunNow = highestPrecedenceWaiting >= wq.getPrecedence()
                && wq.getPrecedence() < getPrecedenceFloor();
        if (mayRunNow) {
            readyQueue(wq);
            return;
        }
        // a waiting inactive queue has a smaller precedence value, or this
        // queue's precedence is at/over the floor: park it as inactive
        deactivateQueue(wq);
    }

먼저 대기열의 우선순위를 다시 평가한 다음, WorkQueue wq를 비활성 대기열 또는 꺼낼 준비가 된 대기열(readyClassQueues)에 넣습니다.
deactivateQueue(wq) 메서드는 앞에서 분석한 대로 WorkQueue wq를 비활성 대기열에 추가합니다. 이어서 readyQueue(wq) 메서드를 살펴보겠습니다.
/**
     * Put the given queue on the readyClassQueues queue
     *
     * Only the queue's class key is stored in readyClassQueues.
     *
     * @param wq the work queue whose class key is to be readied
     * @throws RuntimeException wrapping the InterruptedException if the
     *         thread is interrupted while putting the key; the thread's
     *         interrupt flag is restored before the exception is thrown
     */
    protected void readyQueue(WorkQueue wq) {
//        assert Thread.currentThread() == managerThread;

        try {
            readyClassQueues.put(wq.getClassKey());
            if (logger.isLoggable(Level.FINE)) {
                logger.log(Level.FINE,
                        "queue readied: " + wq.getClassKey());
            }
        } catch (InterruptedException e) {
            // Catching InterruptedException clears the interrupt flag;
            // set it again so code further up the stack can still see
            // that the thread was interrupted.
            Thread.currentThread().interrupt();
            logger.log(Level.SEVERE, "unable to ready queue " + wq, e);
            // propagate interrupt up
            throw new RuntimeException(e);
        }
    }

이 메서드는 WorkQueue wq의 classKey를 준비 대기열 readyClassQueues에 추가합니다.
void wakeQueues() 메서드로 다시 돌아가면, 그다음에는 snoozedOverflow 컨테이너에서 휴면이 만료된 대기열을 꺼냅니다. (snoozedOverflow는 넘친(overflow) 휴면 대기열을 보관하는 정렬된 Map으로, headMap(System.currentTimeMillis())로 현재 시각 이전의 항목만 가져옵니다.) 꺼낸 WorkQueue wq의 wake time을 0으로 초기화한 다음, 마찬가지로 reenqueueQueue(queue)로 어느 대기열에 넣을지 결정합니다.
CrawlURI findEligibleURI() 메서드로 돌아가서, 그 안에서 호출하는 void checkFutures() 메서드를 살펴봅니다. 이 메서드는 예약된 시각이 지난 CrawlURI 객체를 찾아 receive(curi)로 넘겨 BDB 데이터베이스에 다시 넣습니다.
/**
     * Check for any future-scheduled URIs now eligible for reenqueuing
     *
     * Each entry of futureUris keyed before the current time is removed,
     * has its reschedule time reset to -1, and is passed to receive().
     */
    protected void checkFutures() {
//        assert Thread.currentThread() == managerThread;
        // TODO: consider only checking this every set interval
        if (futureUris.isEmpty()) {
            return;
        }
        synchronized (futureUris) {
            long now = System.currentTimeMillis();
            Iterator<CrawlURI> due = futureUris.headMap(now).values().iterator();
            while (due.hasNext()) {
                CrawlURI rescheduled = due.next();
                rescheduled.setRescheduleTime(-1); // unless again set elsewhere
                due.remove();
                futureUriCount.decrementAndGet();
                receive(rescheduled);
            }
        }
    }

계속해서 아래를 보면, String key = readyClassQueues.poll()은 준비 대기열 readyClassQueues에서 맨 앞 요소(WorkQueue wq의 classKey)를 꺼냅니다.
준비 대기열에 요소가 없으면 비활성 대기열 inactiveQueues 중 하나를 활성화하여, 알맞은 WorkQueue wq를 꺼낼 준비가 된 readyClassQueues에 넣습니다.
activateInactiveQueue()
/**
     * Activate an inactive queue, if any are available.
     *
     * Walks the inactive queues grouped by precedence in the map's
     * iteration order and polls work-queue keys from each group. A polled
     * queue whose current precedence is greater than the group it was found
     * in is deactivated again and skipped. The first acceptable key is put
     * on readyClassQueues.
     *
     * @return true if a queue key was moved to readyClassQueues, false if no
     *         inactive queue could be activated
     * @throws RuntimeException wrapping the InterruptedException if the
     *         thread is interrupted while putting the key; the thread's
     *         interrupt flag is restored before the exception is thrown
     */
    protected boolean activateInactiveQueue() {
        for (Entry<Integer, Queue<String>> entry: getInactiveQueuesByPrecedence().entrySet()) {
            int expectedPrecedence = entry.getKey();
            Queue<String> queueOfWorkQueueKeys = entry.getValue();

            while (true) {
                synchronized (getInactiveQueuesByPrecedence()) {
                    String workQueueKey = queueOfWorkQueueKeys.poll();
                    if (workQueueKey == null) {
                        // this precedence group is drained; try the next one
                        break;
                    }

                    WorkQueue candidateQ = (WorkQueue) this.allQueues.get(workQueueKey);
                    if (candidateQ.getPrecedence() > expectedPrecedence) {
                        // queue demoted since placed; re-deactivate
                        deactivateQueue(candidateQ);
                        candidateQ.makeDirty();
                        continue;
                    }

                    updateHighestWaiting(expectedPrecedence);
                    try {
                        // readyClassQueues holds class keys, not WorkQueue objects
                        readyClassQueues.put(workQueueKey);
                    } catch (InterruptedException e) {
                        // Catching InterruptedException clears the interrupt
                        // flag; set it again so callers can still see that
                        // the thread was interrupted.
                        Thread.currentThread().interrupt();
                        throw new RuntimeException(e);
                    }

                    return true;
                }
            }
        }

        return false;
    }

다음은 비활성 대기열 inactiveQueues에서 대기 중인 가장 높은 우선순위 값(즉 precedence의 최솟값)을 갱신하는 메서드입니다.
/**
     * Recalculate the value of the highest-precedence queue waiting
     * among inactive queues.
     *
     * Sets highestPrecedenceWaiting to the first precedence key, at or after
     * startFrom, whose inactive queue is non-empty; if there is none, it is
     * set to Integer.MAX_VALUE.
     *
     * @param startFrom start looking at this precedence value
     */
    protected void updateHighestWaiting(int startFrom) {
        // Integer.MAX_VALUE stands for "nothing waiting"
        int firstWaiting = Integer.MAX_VALUE;
        // probe for new highestWaiting
        for (int candidate : getInactiveQueuesByPrecedence().tailMap(startFrom).keySet()) {
            boolean hasWaiters = !getInactiveQueuesByPrecedence().get(candidate).isEmpty();
            if (hasWaiters) {
                firstWaiting = candidate;
                break;
            }
        }
        highestPrecedenceWaiting = firstWaiting;
    }

위 메서드는 비활성 대기열 inactiveQueues에서 지정한 값(startFrom) 이상의 키를 가진 부분 맵을 가져와, highestPrecedenceWaiting 값을 비어 있지 않은 대기열 중 precedence가 가장 작은 값으로 설정합니다 (inactiveQueues는 정렬되어 있습니다). 비어 있지 않은 대기열이 하나도 없으면 Integer.MAX_VALUE로 설정합니다.
---------------------------------------------------------------------------
본 시리즈의 Heritrix 3.1.0 원본 해석은 본인이 창작한 것입니다.
전재할 때에는 출처가 블로그 정원(cnblogs)의 「고슴도치의 온순함」임을 밝혀 주십시오.
본문 링크http://www.cnblogs.com/chenying99/archive/2013/04/21/3033510.html

좋은 웹페이지 즐겨찾기