epoll 소스 코드 탐구 (epoll_ctl)

epoll 소스 코드 탐구 (epoll_ctl)
몇 가지 기본 데이터 구조
epitem
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
    union {
        /* RB tree node links this structure to the eventpoll RB tree */
        struct rb_node rbn;
        /* Used to free the struct epitem */
        struct rcu_head rcu;
    };

    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;

    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items. Holds EP_UNACTIVE_PTR while the
     * item is not on that chain (see ep_insert() / ep_poll_callback()).
     */
    struct epitem *next;

    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;

    /*
     * Number of active wait queue attached to poll operations.
     * ep_ptable_queue_proc() sets it to -1 to report a failure.
     */
    int nwait;

    /* List containing poll wait queues */
    struct list_head pwqlist;

    /* The "container" of this item */
    struct eventpoll *ep;

    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source __rcu *ws;

    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
};

eventpoll
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
    /* Protect the access to this structure */
    spinlock_t lock;

    /*
     * This mutex is used to ensure that files are not removed
     * while epoll is using them. This is held during the event
     * collection loop, the file cleanup path, the epoll file exit
     * code and the ctl operations.
     */
    struct mutex mtx;

    /*
     * Wait queue used by sys_epoll_wait(). ep_poll_callback() and
     * ep_insert() wake it with wake_up_locked() when an item is put on
     * the ready list.
     */
    wait_queue_head_t wq;

    /*
     * Wait queue used by file->poll(). It is woken through
     * ep_poll_safewake() after ep->lock has been released.
     */
    wait_queue_head_t poll_wait;

    /* List of ready file descriptors */
    struct list_head rdllist;

    /* RB tree root used to store monitored fd structs */
    struct rb_root rbr;

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock. While it differs from EP_UNACTIVE_PTR,
     * ep_poll_callback() chains items here instead of on rdllist.
     */
    struct epitem *ovflist;

    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;

    /* The user that created the eventpoll descriptor */
    struct user_struct *user;

    /* NOTE(review): presumably the struct file backing this eventpoll - confirm */
    struct file *file;

    /* used to optimize loop detection check */
    int visited;
    struct list_head visited_list_link;
};

poll_table_struct
/*
 * Poll table passed to a file's poll operation; ep_insert() hands one to
 * ep_item_poll() after setting it up with init_poll_funcptr().
 */
typedef struct poll_table_struct {
  poll_queue_proc _qproc; // queueing callback; epoll registers ep_ptable_queue_proc() here (presumably via init_poll_funcptr() - confirm)
  unsigned long _key;       // event mask; the original note relates it to epi->event.events
} poll_table;

eppoll_entry
/*
 * Wait-queue hook used by the poll machinery: one entry is allocated in
 * ep_ptable_queue_proc() for each wait queue head an epitem is attached to.
 */
struct eppoll_entry {
  struct list_head llink;// links this entry into the owning epitem's pwqlist
  struct epitem *base;    // the epitem this hook belongs to
  wait_queue_t wait;      // wait queue entry whose callback is ep_poll_callback()
  wait_queue_head_t *whead;// wait queue head "wait" was added to; cleared to NULL by ep_poll_callback() on POLLFREE
};

__wait_queue
/*
 * A single entry on a wait queue.
 * NOTE(review): presumably the structure behind the wait_queue_t typedef
 * used elsewhere in this excerpt - the typedef itself is not shown.
 */
struct __wait_queue {
  /* WQ_FLAG_* bits for this entry */
  unsigned int      flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  /* Opaque per-entry data; its use is not shown in this excerpt */
  void          *private;
  /* Wake-up callback; epoll installs ep_poll_callback() via init_waitqueue_func_entry() */
  wait_queue_func_t func;
  /* List linkage; ep_poll_callback() unlinks it with list_del_init() on POLLFREE */
  struct list_head  task_list;
};

epoll_ctl 함수:
/*
 * Control interface of an epoll instance: `epfd` names the epoll instance,
 * `op` selects the operation (EPOLL_CTL_ADD / EPOLL_CTL_DEL / EPOLL_CTL_MOD),
 * `fd` is the descriptor being operated on and `event` describes the
 * events of interest for operations that take one.
 */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

1레벨: copy_from_user() (사용자 공간에서 관심 이벤트를 복사해야 하는지 확인)
/*
 * Only when ep_op_has_event(op) is true is the caller's struct epoll_event
 * copied into the local epds. A non-zero copy_from_user() result is treated
 * as failure and control jumps to error_return.
 */
if (ep_op_has_event(op) &&copy_from_user(&epds, event, sizeof(struct epoll_event)))
     goto error_return;

1레벨: ep_find() (레드-블랙 트리에서 fd에 대응하는 epitem 찾기)
/* Look up the item registered for (tf.file, fd); epi is NULL when there is none. */
epi = ep_find(ep, tf.file, fd);

2레벨: ep_find() — 레드-블랙 트리를 보호하는 뮤텍스를 획득한 상태에서 호출됩니다
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
    int kcmp;
    struct rb_node *rbp;
    struct epitem *epi, *epir = NULL;
    struct epoll_filefd ffd;

    ep_set_ffd(&ffd, file, fd);
    for (rbp = ep->rbr.rb_node; rbp; ) {
        epi = rb_entry(rbp, struct epitem, rbn);
        kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
        if (kcmp > 0)
            rbp = rbp->rb_right;
        else if (kcmp < 0)
            rbp = rbp->rb_left;
        else {
            epir = epi;
            break;
        }
    }

    return epir;
}

1레벨: case 분기문 (ctl 기능 선택)
이 함수는 먼저 eventpoll에서 조작 대상 fd에 대응하는 epitem 객체가 존재하는지 찾은 다음, 사용자가 지정한 명령 파라미터에 따라 해당 처리를 수행합니다. epoll에 추가되는 모든 파일은 하나의 epitem 객체에 연결됩니다. epoll의 파일 삭제 명령과 파일 수정 명령은 각각 ep_remove()와 ep_modify()가 처리합니다. 이 두 함수는 비교적 간단하므로 자세히 분석하지 않습니다. 주로 살펴볼 것은 epoll의 추가 명령에 대응하는 함수 ep_insert()입니다.
        /*
         * Dispatch on the requested operation. The case bodies are elided
         * in this excerpt; the calls each one makes are shown separately.
         */
        switch (op) {
        case EPOLL_CTL_ADD:
            /* add path: ep_insert() */
            break;
        case EPOLL_CTL_DEL:
            /* delete path: ep_remove() */
            break;
        case EPOLL_CTL_MOD:
            /* modify path: ep_modify() */
            break;
        }

2레벨: EPOLL_CTL_ADD 처리 (full_check가 수행된 경우 clear_tfile_check_list()를 호출)
/* Register (tf.file, fd) with the events in epds; yields 0 or a negative errno. */
error = ep_insert(ep, &epds, tf.file, fd, full_check);

3레벨: init_poll_funcptr() 콜백 함수 초기화
ep_ptable_queue_proc() 함수를 등록합니다
/* Set ep_ptable_queue_proc() as the queueing callback of the poll table embedded in epq. */
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

등록된 콜백 함수 ep_ptable_queue_proc()의 내부
/*
 * Queueing callback registered through init_poll_funcptr(). It is handed
 * the wait queue head (whead) that the epitem behind `pt` should be hooked
 * onto.
 *
 * On success a new eppoll_entry is queued on whead with ep_poll_callback()
 * as its wake-up function, linked into the item's pwqlist, and nwait is
 * incremented. On failure (an earlier error already recorded as nwait < 0,
 * or the allocation failed) nwait is set to -1 so the caller can detect it.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                 poll_table *pt)
{
    struct epitem *item = ep_item_from_epqueue(pt);
    struct eppoll_entry *hook = NULL;

    /* Do not even try to allocate once a previous failure was recorded. */
    if (item->nwait >= 0)
        hook = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

    if (!hook) {
        /* We have to signal that an error occurred */
        item->nwait = -1;
        return;
    }

    /*
     * Fully initialize the hook before it becomes visible on whead:
     * once queued, ep_poll_callback() may run against it.
     */
    init_waitqueue_func_entry(&hook->wait, ep_poll_callback);
    hook->base = item;
    hook->whead = whead;

    add_wait_queue(whead, &hook->wait);
    list_add_tail(&hook->llink, &item->pwqlist);
    item->nwait++;
}

콜백 함수 ep_poll_callback()의 구현 과정 (대응하는 파일 디스크립터에서 이벤트가 발생했을 때 호출됩니다)
/*
 * Wake-up callback installed on every wait queue entry created by
 * ep_ptable_queue_proc().
 *
 * Under ep->lock it either chains the item on ep->ovflist (while that list
 * is active) or appends it to ep->rdllist, then wakes waiters on ep->wq.
 * Waiters on ep->poll_wait are woken after the lock has been dropped.
 *
 * `key`, when non-NULL, carries the event bits reported by the waker and is
 * matched against the item's event mask. Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;

    if ((unsigned long)key & POLLFREE) {
        ep_pwq_from_wait(wait)->whead = NULL;
        /*
         * whead = NULL above can race with ep_remove_wait_queue()
         * which can do another remove_wait_queue() after us, so we
         * can't use __remove_wait_queue(). whead->lock is held by
         * the caller.
         */
        list_del_init(&wait->task_list);
    }

    spin_lock_irqsave(&ep->lock, flags);

    /*
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;

    /*
     * Check the events coming with the callback. At this stage, not
     * every device reports the events in the "key" parameter of the
     * callback. We need to be able to handle both cases here, hence the
     * test for "key" != NULL before the event match test.
     */
    if (key && !((unsigned long) key & epi->event.events))
        goto out_unlock;

    /*
     * If we are transferring events to userspace, we can hold no locks
     * (because we're accessing user memory, and because of linux f_op->poll()
     * semantics). All the events that happen during that period of time are
     * chained in ep->ovflist and requeued later on.
     */
    if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
        if (epi->next == EP_UNACTIVE_PTR) {
            /* epi is not on the overflow chain yet: push it at the head of ep->ovflist */
            epi->next = ep->ovflist;
            ep->ovflist = epi;
            if (epi->ws) {
                /*
                 * Activate ep->ws since epi->ws may get
                 * deactivated at any time.
                 */
                __pm_stay_awake(ep->ws);
            }

        }
        goto out_unlock;
    }

    /* If this file is already in the ready list we exit soon */
    if (!ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake_rcu(epi);
    }

    /*
     * Wake up ( if active ) both the eventpoll wait list and the ->poll()
     * wait list.
     */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

out_unlock:
    spin_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 1;
}

3레벨: ep_rbtree_insert() (레드-블랙 트리에 노드 추가)
3레벨: list_add_tail() (준비 큐(ready list)에 노드 추가)
/*
 * Add a new item for (tfile, fd) to the eventpoll `ep`.
 *
 * Allocates and initializes an epitem, attaches it to the file's poll wait
 * queues through ep_item_poll(), links it into tfile->f_ep_links and the
 * eventpoll RB tree, and puts it on the ready list right away when the
 * file already reports one of the requested events.
 *
 * Must be called with ep->mtx held (see the RB tree comment in the body).
 *
 * Returns 0 on success, or:
 *   -ENOSPC  the user's epoll watch count already reached max_user_watches
 *   -ENOMEM  the epitem or a poll wait queue hook could not be allocated
 *   -EINVAL  full_check is set and reverse_path_check() returned non-zero
 *   the error returned by ep_create_wakeup_source() when EPOLLWAKEUP is set
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
             struct file *tfile, int fd, int full_check)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    long user_watches;
    struct epitem *epi;
    struct ep_pqueue epq;

    /*
     * Refuse the request when the number of epoll watches owned by this
     * user has already reached max_user_watches.
     */
    user_watches = atomic_long_read(&ep->user->epoll_watches);
    if (unlikely(user_watches >= max_user_watches))
        return -ENOSPC;
    /* Allocate the epitem that will represent the new file from epi_cache. */
    if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
        return -ENOMEM;

    /* Item initialization follow here ... */
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    if (epi->event.events & EPOLLWAKEUP) {
        error = ep_create_wakeup_source(epi);
        if (error)
            goto error_create_wakeup_source;
    } else {
        RCU_INIT_POINTER(epi->ws, NULL);
    }

    /* Initialize the poll table using the queue callback */
    epq.epi = epi;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function. Note that after
     * this operation completes, the poll callback can start hitting
     * the new item.
     */
    /*
     * NOTE(review): per the original annotation, when fd is a socket the
     * file's f_op is socket_file_ops and its poll method is sock_poll();
     * for TCP that ends up in tcp_poll(), which calls sock_poll_wait(),
     * which in turn invokes the callback stored in epq.pt, i.e.
     * ep_ptable_queue_proc(). The poll method's result is what lands in
     * revents. None of those functions are visible in this excerpt.
     */
    revents = ep_item_poll(epi, &epq.pt);

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    /* ep_ptable_queue_proc() reports such a failure by setting nwait to -1. */
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;

    /* Add the current item to the list of active epoll hook for ths file */
    spin_lock(&tfile->f_lock);
    list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_lock);

    /*
     * Add the current item to the RB tree. All RB tree operations are
     * protected by "mtx", and ep_insert() is called with "mtx" held.
     */
    ep_rbtree_insert(ep, epi);

    /* now check if we've created too many backpaths */
    error = -EINVAL;
    if (full_check && reverse_path_check())
        goto error_remove_epi;

    /* We have to drop the new item inside our item list to keep track of it */
    spin_lock_irqsave(&ep->lock, flags);

    /* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake(epi);

        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    spin_unlock_irqrestore(&ep->lock, flags);

    atomic_long_inc(&ep->user->epoll_watches);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 0;

    /*
     * Error unwinding: each label undoes the steps completed before the
     * corresponding failure point and falls through to the next one.
     */
error_remove_epi:
    spin_lock(&tfile->f_lock);
    list_del_rcu(&epi->fllink);
    spin_unlock(&tfile->f_lock);

    rb_erase(&epi->rbn, &ep->rbr);

error_unregister:
    ep_unregister_pollwait(ep, epi);

    /*
     * We need to do this because an event could have been arrived on some
     * allocated wait queue. Note that we don't care about the ep->ovflist
     * list, since that is used/cleaned only inside a section bound by "mtx".
     * And ep_insert() is called with "mtx" held.
     */
    spin_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        list_del_init(&epi->rdllink);
    spin_unlock_irqrestore(&ep->lock, flags);

    wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
    kmem_cache_free(epi_cache, epi);

    return error;
}

2레벨: EPOLL_CTL_DEL 처리
/* Delete path: hand the item to ep_remove() (not shown in this excerpt). */
error = ep_remove(ep, epi);

2레벨: EPOLL_CTL_MOD 처리
/* Modify path: pass the item and the event description in epds to ep_modify() (not shown in this excerpt). */
error = ep_modify(ep, epi, &epds);

1레벨: fdput() (임시로 얻어 둔 두 개의 파일 참조를 해제)
    /*
     * Release the two struct fd handles. tf is the target file (tf.file is
     * what ep_find()/ep_insert() operate on).
     * NOTE(review): f is presumably the epoll file looked up from epfd -
     * its acquisition is not shown in this excerpt.
     */
    fdput(tf);
    fdput(f);

좋은 웹페이지 즐겨찾기