UDP发包提速的有关问题

UDP发包提速的问题
各位大侠们,帮我看下这段代码:我想以均匀的间隔快速发送UDP包,计划每200 us发一个包,可是测试时发包间隔总是不均匀,速度也提不上来,不知道是怎么回事。
C/C++ code
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <pthread.h>
#include <unistd.h>

/*
 * Short type aliases used throughout this file, implemented as macros.
 * NOTE(review): the "32" in ulong32/long32 is only nominal -- they expand to
 * `unsigned long` / `long`, which are 64 bits wide on LP64 platforms
 * (e.g. 64-bit Linux). Do not rely on them for fixed-width wire fields.
 */
#define  int8      char
#define  uint8     unsigned char
#define  uint32    unsigned int
#define  ulong32   unsigned long
#define  long32    long
#define  int32     int
#define  long64    long long

// 3600s*24h*(365days*70years+17days) = 2208988800 seconds, i.e. the offset
// between the NTP era start (1900) and the Unix epoch (1970).
#define  From00to70 0x83aa7e80U

#define  NTPSVR            "192.168.2.30"                        // address of the NTP server under test
//#define  LOCAL              "192.168.2.109"
#define  LOCALPORT       8000                                    // local UDP port stored in local_addr
#define  NTPPORT           123                                   // destination UDP port (NTP)
  
int                pthreads                    =  0;            // number of sender threads (read from stdin in main)
// NOTE(review): lower-case macro; any later identifier spelled `timer` will be replaced by 200.
#define            timer                      200                   // pacing interval: one packet every `timer` microseconds
long int           total_pkt             =   0;                       // total number of packets to send, initialised to 0
int                 maxpkt                 =   0  ;               // packets sent per thread (main sets it to total_pkt / pthreads)
int                 counter                =      0;            // scratch counter used while sending
int                 fd[2];
int                 send_tmp             =     0;                 // running count of packets sent
int                 rec_maxpkt;                                   // total number of packets to receive
int                 rec_pkt                =     0;              // count of packets received
    
 
pthread_mutex_t ntppack_mutex = PTHREAD_MUTEX_INITIALIZER;// statically initialised; taken in send_pkt around the send and counter update
pthread_mutex_t newpack_mutex = PTHREAD_MUTEX_INITIALIZER;// statically initialised; not used in the code visible here


  // UDP socket used by send_pkt; main creates and closes it around each thread.
  int32  sockfd;
  // Wall-clock timestamps taken in main before forking and after sending finishes.
  struct timeval tv_start,tv_end;
  struct timezone tz_start,tz_end;
  // addr: destination (NTPSVR:NTPPORT). local_addr: filled in by main but never bound in the code visible here.
  struct sockaddr_in addr,local_addr;   
  // Handle of the most recently created sender thread.
  pthread_t tidA;

/*
 * NTP packet header as it is put on the wire (48 bytes).
 *
 * Every multi-byte field is exactly 32 bits wide. Fixed-width types are used
 * on purpose: `unsigned long` is 64 bits on LP64 platforms, which would
 * inflate the struct to 96 bytes and make sendto() transmit a malformed
 * packet. Multi-byte fields are expected in network byte order (NTP_Init
 * stores the originate timestamp through htonl()).
 */
typedef struct NTPPACKET
{
  uint8_t   li_vn_mode;          /* leap indicator (2 bits) | version (3 bits) | mode (3 bits) */
  uint8_t   stratum;
  uint8_t   poll;
  uint8_t   precision;           /* local clock precision (a signed quantity carried in one byte) */
  uint32_t  root_delay;          /* total round-trip delay to the reference source, fixed point */
  uint32_t  root_dispersion;     /* error estimate relative to the reference source, fixed point */
  char      ref_id[4];
  uint32_t  reftimestamphigh;    /* time the local clock was last set or corrected (T4) */
  uint32_t  reftimestamplow;
  uint32_t  oritimestamphigh;    /* T1: time the request left the client, 64-bit timestamp format */
  uint32_t  oritimestamplow;
  uint32_t  recvtimestamphigh;   /* T2: time the request arrived at the server, 64-bit timestamp format */
  uint32_t  recvtimestamplow;
  uint32_t  trantimestamphigh;   /* T3: time the reply left the server; used to correct the local clock */
  uint32_t  trantimestamplow;
}NTPPacket;

// ntppack is the request template that NTP_Init fills in and send_pkt transmits;
// newpack is not used in the code visible here.
NTPPacket  ntppack,newpack;

// Declared as long64 (long long) to avoid sign-bit problems with 32-bit values.
long64   firsttimestamp,finaltimestamp;
long64   diftime,delaytime;

void NTP_Init()
{
  bzero(&ntppack,sizeof(ntppack));
  ntppack.li_vn_mode=0x1b;//0|(3<<2)|(3<<5);
  //获取初始时间戳T1
  firsttimestamp="From00to70"+time(NULL);//-8*3600;
  ntppack.oritimestamphigh=htonl(firsttimestamp);
}


/*
*按给定的要求每x us 发送一个请求
*
*/
void *send_pkt(void *);

void  *send_pkt  (void *vptr)
   { 
    //every pthread need send X packets
         int  num =0;
    struct timeval tv_send_start,tv_send_end;
    struct timezone tz_send_start,tz_send_end;

    for(num;num<maxpkt;num++)
        {
           
        pthread_mutex_trylock(&ntppack_mutex);                     //加锁   
         counter=send_tmp+1;
                 //发送数据请求包
         sendto(sockfd,&ntppack,sizeof(ntppack),0,
                                  (struct sockaddr *)&addr,sizeof(struct sockaddr));           
         send_tmp=counter;        
         printf(" A have been send  %d  packets\n",counter);        
         pthread_mutex_unlock(&ntppack_mutex);                  //解锁


       usleep(timer);     //计算每X us发一个请求   
    
   }  

/* Result of the most recent fork() in this process; read by main after create_fork(). */
volatile pid_t pid;

/*
 * Forks up to four child processes from the calling process.
 *
 * A child leaves the loop as soon as it is created; without that every child
 * would keep forking and the loop would produce 2^4 = 16 processes instead
 * of four children. The loop also stops at the first fork() failure.
 *
 * Returns (and leaves in the global `pid`): 0 in each child; in the parent
 * the pid of the last child created, or -1 if a fork() failed.
 */
int create_fork()
{
  int num_fork;

  for (num_fork = 0; num_fork < 4; num_fork++)
    {
      pid = fork();
      if (pid <= 0)      /* child (0) or fork failure (-1): stop forking */
        break;
    }
  return pid;
}

/*
 * Runs `pthreads` sender threads one after another in the calling process.
 * Each thread gets a fresh UDP socket in the global `sockfd` and is joined
 * before the next one starts, so at most one thread is sending at any time
 * and the per-process rate stays at one packet per `timer` microseconds.
 * Exits the process if a socket or a thread cannot be created.
 */
static void run_sender_threads(void)
{
    int i;

    for (i = 0; i < pthreads; i++) {
        int err;

        sockfd = socket(AF_INET, SOCK_DGRAM, 0);
        if (sockfd < 0) {
            perror("create socket error");
            exit(1);
        }

        err = pthread_create(&tidA, NULL, send_pkt, NULL);
        if (err != 0) {
            /* pthread_create reports failure through its return value, not errno. */
            fprintf(stderr, "pthread_create: %s\n", strerror(err));
            close(sockfd);
            exit(1);
        }
        pthread_join(tidA, NULL);
        close(sockfd);
    }
}

/*
 * Reads the total packet count and the thread count from stdin, prepares the
 * destination address and the NTP request, forks the worker processes and
 * lets every process run its sender threads. Each process then prints its own
 * elapsed time.
 *
 * Returns 0 on success and 1 on invalid input.
 */
int main(void)
{
    int status = 0;
    long long use_time;

    printf("How many packets do you want to send :");
    if (scanf("%ld", &total_pkt) != 1 || total_pkt < 0) {
        fprintf(stderr, "invalid packet count\n");
        return 1;
    }

    printf("How many pthreads do you need: ");
    if (scanf("%d", &pthreads) != 1 || pthreads <= 0) {
        /* Rejecting 0 also prevents the division by zero below. */
        fprintf(stderr, "thread count must be a positive integer\n");
        return 1;
    }

    maxpkt = (int)(total_pkt / pthreads);

    /* Destination: the NTP server. */
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(NTPPORT);
    addr.sin_addr.s_addr = inet_addr(NTPSVR);

    /* Local endpoint description (filled in, but not bound to the socket here). */
    memset(&local_addr, 0, sizeof(local_addr));
    local_addr.sin_family = AF_INET;
    local_addr.sin_port = htons(LOCALPORT);
    local_addr.sin_addr.s_addr = INADDR_ANY;

    NTP_Init();
    gettimeofday(&tv_start, &tz_start);

    /* Flush before forking so buffered prompt text is not duplicated into every child. */
    fflush(stdout);
    create_fork();

    if (pid > 0) {
        run_sender_threads();
        if (waitpid(pid, &status, 0) < 0)
            perror("waitpid");
        else
            printf("status %d\n", status);
    } else if (pid == 0) {
        run_sender_threads();
    } else {
        perror("fork");
    }

    gettimeofday(&tv_end, &tz_end);
    printf("use pthreads = %d\n", pthreads);
    printf("%d us send a packet\n", timer);
    use_time = 1000000 * ((long long)tv_end.tv_sec - (long long)tv_start.tv_sec)
               + (tv_end.tv_usec - tv_start.tv_usec);
    printf("total run time %lld us\n", use_time);

    /* Normal completion is reported as success (the old code exited with status 1). */
    return 0;
}