ip_vs Implementation Analysis (6)

The Copyleft of this document belongs to yfydz. It is released under the GPL and may be freely copied and reposted; when reposting, keep the document intact. Use for any commercial purpose is prohibited.
msn: [email protected]
Source: http://yfydz.cublog.cn
8. IPVS Packet Transmission
The transmit function of an IPVS connection is bound by ip_vs_bind_xmit(); the transmitters themselves are all implemented in net/ipv4/ipvs/ip_vs_xmit.c.
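For reference, ip_vs_bind_xmit() lives in net/ipv4/ipvs/ip_vs_conn.c and simply picks one of the transmitters described below according to the connection's forwarding method; in this kernel version it looks roughly like this:

static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
 switch (IP_VS_FWD_METHOD(cp)) {
 case IP_VS_CONN_F_MASQ:   /* NAT */
  cp->packet_xmit = ip_vs_nat_xmit;
  break;
 case IP_VS_CONN_F_TUNNEL:   /* IPIP tunnel */
  cp->packet_xmit = ip_vs_tunnel_xmit;
  break;
 case IP_VS_CONN_F_DROUTE:   /* direct routing */
  cp->packet_xmit = ip_vs_dr_xmit;
  break;
 case IP_VS_CONN_F_LOCALNODE:  /* local node */
  cp->packet_xmit = ip_vs_null_xmit;
  break;
 case IP_VS_CONN_F_BYPASS:   /* bypass */
  cp->packet_xmit = ip_vs_bypass_xmit;
  break;
 }
}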
8.1 NAT Transmission

The NAT transmitter handles only traffic in the outside-to-inside direction, i.e. it performs destination NAT (DNAT) on packets going from the client towards the real server.
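The function below (like the TUNNEL, DR and ICMP transmitters later on) starts by asking __ip_vs_get_out_rt() for a route to the real server. That helper is defined earlier in ip_vs_xmit.c and is not quoted in this part; conceptually it behaves like the simplified sketch below (the sketch's name is made up here, and the real helper additionally caches the looked-up route in cp->dest under a lock):

/* simplified, illustrative sketch only */
static struct rtable *get_out_rt_sketch(struct ip_vs_conn *cp, u32 rtos)
{
 struct rtable *rt;
 struct flowi fl = {
  .oif = 0,
  .nl_u = { .ip4_u = { .daddr = cp->daddr, /* the real server */
         .saddr = 0,
         .tos = rtos, } },
 };

 if (ip_route_output_key(&rt, &fl))
  return NULL;  /* no route to the real server */
 return rt;
}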
/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct ip_vs_protocol *pp)
{
 struct rtable *rt;  /* Route to the other host */
 int mtu;
 struct iphdr *iph = skb->nh.iph;
 EnterFunction(10);
 /* check if it is a connection of no-client-port */
 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
// the connection was created without a client port (port 0);
// read the port from the packet payload and fill it in
  __u16 _pt, *p;
  p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
  if (p == NULL)
   goto tx_error;
// *p is the client port taken from the packet; record it in the connection
  ip_vs_conn_fill_cport(cp, *p);
  IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 }

 // get the route to the real server; on failure report a link failure (ICMP)
 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
  goto tx_error_icmp;

 /* MTU checking */
 // if the packet exceeds the path MTU and the DF bit is set, send
 // ICMP "fragmentation needed" back and drop the packet
 mtu = dst_mtu(&rt->u.dst);
 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
  ip_rt_put(rt);
  icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
  IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
  goto tx_error;
 }

 /* copy-on-write the packet before mangling it */
 // make sure at least the IP header of the skb is writable
 if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
  goto tx_error_put;
 // reserve headroom for the output device's link-layer (MAC) header
 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
  goto tx_error_put;

 /* drop old route */
 // replace the skb's cached route with the route to the real server
 dst_release(skb->dst);
 skb->dst = &rt->u.dst;

 /* mangle the packet */
 // let the transport protocol (TCP/UDP/...) handler do its part of the
 // DNAT: rewrite the destination port and fix the transport checksum
 if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
  goto tx_error;
 // rewrite the destination address to the real server's address
 skb->nh.iph->daddr = cp->daddr;
 // recompute the IP header checksum
 ip_send_check(skb->nh.iph);

 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

 /* FIXME: when application helper enlarges the packet and the length
    is larger than the MTU of outgoing device, there will be still
    MTU problem. */

 /* Another hack: avoid icmp_send in ip_fragment */
 // don't fragment
 skb->local_df = 1;

 // re-inject the packet at the netfilter LOCAL_OUT hook so that it is
 // sent out through the normal OUTPUT path
 IP_VS_XMIT(skb, rt);

 LeaveFunction(10);
 // the skb has been taken over, so tell the calling hook it was STOLEN
 return NF_STOLEN;

  tx_error_icmp:
 dst_link_failure(skb);
  tx_error:
 LeaveFunction(10);
 kfree_skb(skb);
 return NF_STOLEN;
  tx_error_put:
 ip_rt_put(rt);
 goto tx_error;
}

8.2 TUNNEL Transmission

The TUNNEL method wraps the original packet in a new outer IP header whose protocol field is IPIP (4) and sends it to the chosen real server; the packet leaving the director is therefore [outer IP: director -> real server, protocol 4][original IP: client -> VIP][payload]. The real server strips the outer header and processes the inner packet, whose destination is still the VIP, so the original packet itself is not modified by IPVS.

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing the load balancer. This can greatly increase the
 *   scalability of virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
    struct ip_vs_protocol *pp)
{
 struct rtable *rt;   /* Route to the other host */
 struct net_device *tdev;  /* Device to other host */
 struct iphdr *old_iph = skb->nh.iph;
 u8 tos = old_iph->tos;
 __be16 df = old_iph->frag_off;
 struct iphdr *iph;   /* Our new IP header */
 int max_headroom;   /* The extra header space needed */
 int mtu;
 EnterFunction(10);
 // only IP packets can be tunnelled; other protocols (ARP, IPX, ...) are an error
 if (skb->protocol != __constant_htons(ETH_P_IP)) {
  IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
        "ETH_P_IP: %d, skb protocol: %d\n",
        __constant_htons(ETH_P_IP), skb->protocol);
  goto tx_error;
 }
 // get the (cached) route to the real server
 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
  goto tx_error_icmp;
 // the device the encapsulated packet will leave through
 tdev = rt->u.dst.dev;
 // the tunnel MTU is the path MTU minus the new outer IP header
 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
 if (mtu < 68) {
  ip_rt_put(rt);
  IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
  goto tx_error;
 }
 // update the path MTU recorded on the original route
 if (skb->dst)
  skb->dst->ops->update_pmtu(skb->dst, mtu);

 // don't fragment
 df |= (old_iph->frag_off&__constant_htons(IP_DF));

 if ((old_iph->frag_off&__constant_htons(IP_DF))
     && mtu < ntohs(old_iph->tot_len)) {
  // the packet is larger than the tunnel MTU and DF is set:
  // send ICMP "fragmentation needed" back and drop the packet
  icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
  ip_rt_put(rt);
  IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
  goto tx_error;
 }

 /*
  * Okay, now see if we can stuff it in the buffer as-is.
  */
 // headroom needed: link-layer header plus the new outer IP header
 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

 if (skb_headroom(skb) < max_headroom
     || skb_cloned(skb) || skb_shared(skb)) {
  // not enough headroom, or the skb is cloned/shared:
  // reallocate a new skb with enough headroom for the outer header
  struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
  if (!new_skb) {
   ip_rt_put(rt);
   kfree_skb(skb);
   IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
   return NF_STOLEN;
  }
  // release the old skb
  kfree_skb(skb);
  // switch to the new skb and re-locate the old IP header
  skb = new_skb;
  old_iph = skb->nh.iph;
 }

 // the transport header pointer now points at the old IP header,
 // which becomes the payload of the new packet
 skb->h.raw = (void *) old_iph;

 /* fix old IP header checksum */
 // recompute the checksum of the old (inner) IP header
 ip_send_check(old_iph);

 // push the skb data pointer down to make room for the new outer IP header
 skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

 /* drop old route */
 // replace the skb's cached route with the route to the real server
 dst_release(skb->dst);
 skb->dst = &rt->u.dst;

 // fill in the new outer IP header
 /*
  * Push down and install the IPIP header.
  */
 iph = skb->nh.iph;
 iph->version = 4;
 iph->ihl = sizeof(struct iphdr)>>2;
 iph->frag_off = df;
 // the protocol of the outer header is IPIP (4)
 iph->protocol = IPPROTO_IPIP;
 iph->tos = tos;
 iph->daddr = rt->rt_dst;
 iph->saddr = rt->rt_src;
 iph->ttl = old_iph->ttl;
 iph->tot_len = htons(skb->len);
 // choose the IP identification for the new header
 ip_select_ident(iph, &rt->u.dst, NULL);
 // compute the checksum of the new outer IP header
 ip_send_check(iph);

 /* Another hack: avoid icmp_send in ip_fragment */
 // don't fragment
 skb->local_df = 1;

 // send the encapsulated packet through the netfilter LOCAL_OUT hook
 IP_VS_XMIT(skb, rt);

 LeaveFunction(10);
 return NF_STOLEN;

  tx_error_icmp:
 dst_link_failure(skb);
  tx_error:
 kfree_skb(skb);
 LeaveFunction(10);
 return NF_STOLEN;
}

8.3 DR Transmission

The DR (direct routing) method does not change the packet at all: the load balancer simply re-sends it towards the chosen real server, so only the destination MAC address of the outgoing frame changes. The real server must therefore be reachable on the same layer-2 network as the IPVS director. With DR, the director owns the VIP and answers ARP for it, while each real server also configures the VIP on a non-ARP interface (for example lo:0 or tunl0 with a host netmask and ARP responses suppressed). The real server thus accepts packets addressed to the VIP without answering ARP for it, and sends its replies, with the VIP as source address, directly back to the client without passing through the load balancer again.

/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
       struct ip_vs_protocol *pp)
{
 struct rtable *rt;   /* Route to the other host */
 struct iphdr *iph = skb->nh.iph;
 int mtu;
 EnterFunction(10);
 // get the route to the real server
 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
  goto tx_error_icmp;

 /* MTU checking */
 mtu = dst_mtu(&rt->u.dst);
 if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
  icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
  ip_rt_put(rt);
  IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
  goto tx_error;
 }

 /*
  * Call ip_send_check because we are not sure it is called
  * after ip_defrag. Is copy-on-write needed?
  */
 // make sure the skb is not shared before touching it
 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
  ip_rt_put(rt);
  return NF_STOLEN;
 }
 // recompute the IP header checksum
 ip_send_check(skb->nh.iph);

 /* drop old route */
 // drop the old route and attach the route to the real server
 dst_release(skb->dst);
 skb->dst = &rt->u.dst;

 /* Another hack: avoid icmp_send in ip_fragment */
 skb->local_df = 1;

 // send the unmodified packet out through the route to the real server
 IP_VS_XMIT(skb, rt);

 LeaveFunction(10);
 return NF_STOLEN;

  tx_error_icmp:
 dst_link_failure(skb);
  tx_error:
 kfree_skb(skb);
 LeaveFunction(10);
 return NF_STOLEN;
}

8.4 NULL Transmission

Nothing is actually transmitted; the packet is simply accepted (this is used for the local-node case).

/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  struct ip_vs_protocol *pp)
{
 /* we do not touch skb and do not need pskb ptr */
 return NF_ACCEPT;
}

8.5 Bypass Transmission

When the destination real server is not available, the packet bypasses IPVS processing and is routed to its original destination address; this is mainly useful in transparent cache clusters.

/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available, it may be only used in transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
    struct ip_vs_protocol *pp)
{
 struct rtable *rt;   /* Route to the other host */
 struct iphdr *iph = skb->nh.iph;
 u8 tos = iph->tos;
 int mtu;
 // routing key built from the packet's original destination address
 struct flowi fl = {
  .oif = 0,
  .nl_u = {
   .ip4_u = {
    .daddr = iph->daddr,
    .saddr = 0,
    .tos = RT_TOS(tos), } },
 };

 EnterFunction(10);

 // route by the original destination IP, bypassing the IPVS destination
 if (ip_route_output_key(&rt, &fl)) {
  IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
        "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
  goto tx_error_icmp;
 }

 /* MTU checking */
 mtu = dst_mtu(&rt->u.dst);
 if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
  ip_rt_put(rt);
  icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
  IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
  goto tx_error;
 }

 /*
  * Call ip_send_check because we are not sure it is called
  * after ip_defrag. Is copy-on-write needed?
  */
 // make sure the skb is not shared before touching it
 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
  ip_rt_put(rt);
  return NF_STOLEN;
 }
 // recompute the IP header checksum
 ip_send_check(skb->nh.iph);

 /* drop old route */
 // drop the old route and attach the newly found one
 dst_release(skb->dst);
 skb->dst = &rt->u.dst;

 /* Another hack: avoid icmp_send in ip_fragment */
 skb->local_df = 1;

 // send the packet to its original destination
 IP_VS_XMIT(skb, rt);

 LeaveFunction(10);
 return NF_STOLEN;

  tx_error_icmp:
 dst_link_failure(skb);
  tx_error:
 kfree_skb(skb);
 LeaveFunction(10);
 return NF_STOLEN;
}

8.6 ICMP Transmission

Transmitter for ICMP error packets that belong to an IPVS connection; it is called from ip_vs_in_icmp().

/*
 *   ICMP packet transmitter
 *   called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  struct ip_vs_protocol *pp, int offset)
{
 struct rtable *rt;   /* Route to the other host */
 int mtu;
 int rc;

 EnterFunction(10);

 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
    forwarded directly here, because there is no need to
    translate address/port back */
 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
  // for TUNNEL, DR and LOCALNODE the packet is handed to the
  // connection's own transmitter without any rewriting
  if (cp->packet_xmit)
   rc = cp->packet_xmit(skb, cp, pp);
  else
   rc = NF_ACCEPT;
  /* do not touch skb anymore */
  atomic_inc(&cp->in_pkts);
  goto out;
 }

 /*
  * mangle and send the packet here (only for VS/NAT)
  */
 // NAT case: get the route to the real server
 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
  goto tx_error_icmp;

 /* MTU checking */
 mtu = dst_mtu(&rt->u.dst);
 if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
  ip_rt_put(rt);
  icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
  IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
  goto tx_error;
 }

 /* copy-on-write the packet before mangling it */
 // make the first 'offset' bytes of the skb writable
 if (!ip_vs_make_skb_writable(&skb, offset))
  goto tx_error_put;
 // reserve headroom for the output device's link-layer (MAC) header
 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
  goto tx_error_put;

 // replace the old route with the route to the real server
 /* drop the old route when skb is not shared */
 dst_release(skb->dst);
 skb->dst = &rt->u.dst;

 // NAT the ICMP packet and the embedded IP header it carries
 ip_vs_nat_icmp(skb, pp, cp, 0);

 /* Another hack: avoid icmp_send in ip_fragment */
 skb->local_df = 1;

 IP_VS_XMIT(skb, rt);

 rc = NF_STOLEN;
 goto out;

  tx_error_icmp:
 dst_link_failure(skb);
  tx_error:
 dev_kfree_skb(skb);
 rc = NF_STOLEN;
  out:
 LeaveFunction(10);
 return rc;

  tx_error_put:
 ip_rt_put(rt);
 goto tx_error;
}

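Two details that the transmitters above rely on are worth noting, even though their code is not part of the transmit functions quoted in this part.

First, the NAT transmitter delegates the transport-level rewrite to the protocol's dnat_handler. As an illustration, the TCP handler in net/ipv4/ipvs/ip_vs_proto_tcp.c boils down to the following simplified sketch (the function name here is invented; application-helper and hardware-checksum handling are omitted):

/* simplified sketch of what tcp_dnat_handler does */
static int tcp_dnat_sketch(struct sk_buff **pskb,
      struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
 unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
 struct tcphdr *tcph;

 /* the transport header must be private and writable before mangling */
 if (!ip_vs_make_skb_writable(pskb, tcphoff + sizeof(*tcph)))
  return 0;
 tcph = (void *)(*pskb)->nh.iph + tcphoff;
 /* virtual port -> real server port */
 tcph->dest = cp->dport;
 /* incrementally fix the TCP checksum for the address/port change */
 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, cp->vport, cp->dport);
 return 1;
}

Second, every transmitter hands the finished packet to IP_VS_XMIT(skb, rt). In this version of ip_vs_xmit.c the macro is defined approximately as follows: it marks the skb as IPVS property and re-injects it at the netfilter LOCAL_OUT hook, from where dst_output() sends it out of the device attached to the chosen route:

#define IP_VS_XMIT(skb, rt)    \
do {       \
 (skb)->ipvs_property = 1;   \
 (skb)->ip_summed = CHECKSUM_NONE;  \
 NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
  (rt)->u.dst.dev, dst_output);  \
} while (0)

This is also why the transmitters return NF_STOLEN: the skb has already been pushed into the output path, so the hook that called them must not touch it again.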