在进行异步I/O之前需要先初始化AIO控制块,调用aio_read()函数和aio_write()函数来进行异步读写操作
#include<aio.h> int aio_read(struct aiocb *aiocb); int aio_write(struct aiocb *aiocb);
两个函数的返回值:若成功,返回0;若出错,返回-1;
当这些函数返回成功时,异步I/O请求便已经被操作系统放入等待处理的队列中了。这些返回值与实际I/O操作的结果没有任何关系。I/O操作在等待时,必须注意确保AIO控制块和数据缓冲区保持稳定;它们所对应的内存必须始终是合法的,除非I/O操作完成,否则不能被复用。
要想强制所有等待中的异步写操作不等待而写入持久化的存储中,可以设立一个AIO控制块并调用aio_fsync函数
#include<aio.h> int aio_fsync(int op,struct aiocb *aiocb);
成功返回0;出错返回-1;
AIO控制块中的aio_fildes字段指定了其异步写操作被同步的文件。如果op参数设定为O_DSYNC,那么操作执行起来就会像调用了fdatasync一样。否则,如果op参数设定为O_SYNC,
那么操作执行起来就会像调用了fsync一样。
像aio_read()和aio_write()函数一样,aio_fsync在安排好同步操作之后就会返回。在异步同步操作完成之前,数据不会被持久化。AIO控制块中的aio_sigevent字段控制我们如何被通知,这一点与aio_read()和aio_write()函数相同。
int aio_error(struct aiocb *aiopcb); //功能:主动查询,判断当前aio的状态(只是查询状态) 返回值: 0 异步操作成功完成(需要调用aio_return获取操作的返回值) EINPROGRESS I/O还没完成 ECANCELED I/O被取消 -1 对aio_error的调用本身失败(errno给出原因) 其他值 异步操作失败时对应的错误码
ssize_t aio_return(struct aiocb *paiocb); // 返回值为此次IO的数据长度
执行I/O操作时,如果还有其他事务要处理而不想被I/O操作阻塞,就可以使用异步I/O。然而,如果在完成了所有事务时还有异步操作未完成,可以调用aio_suspend函数来阻塞进程,直到操作完成。
int aio_suspend(const struct aiocb *const cblist[],int n,const struct timespec *timeout);
//成功返回0,失败返回-1
取消I/O操作
int aio_cancel(int fd,struct aiocb *aiocb); /* //fd指定了未完成的异步操作的文件描述符, 返回值: AIO_CANCELED 取消成功 AIO_NOTCANCELED 至少有一个要求的操作没有被取消 AIO_ALLDONE 所有操作在尝试取消前已经完成 -1 调用失败*/
AIO 提供了一种方法,使用 lio_listio API 函数同时发起多个传输。这个函数非常重要,因为这意味着我们可以在一个系统调用(一次内核上下文切换)中启动大量的 I/O 操作。从性能的角度来看,这非常重要,因此值得我们花点时间探索一下。
int lio_listio( int mode, struct aiocb *list[], int nent, struct sigevent *sig );
mode 参数可以是 LIO_WAIT 或 LIO_NOWAIT。LIO_WAIT 会阻塞这个调用,直到所有的 I/O 都完成为止。在操作进行排队之后,LIO_NOWAIT 就会返回。list 是一个 aiocb 引用的列表,最大元素的个数是由 nent 定义的。注意 list 的元素可以为 NULL,lio_listio 会将其忽略。sigevent 引用定义了在所有 I/O 操作都完成时产生信号的方法。
对于 lio_listio 的请求与传统的 read 或 write 请求在必须指定的操作方面稍有不同,如下面的代码所示:
struct aiocb aiocb1, aiocb2; struct aiocb *list[MAX_LIST]; ... /* Prepare the first aiocb */ aiocb1.aio_fildes = fd; aiocb1.aio_buf = malloc( BUFSIZE+1 ); aiocb1.aio_nbytes = BUFSIZE; aiocb1.aio_offset = next_offset; aiocb1.aio_lio_opcode = LIO_READ; ... bzero( (char *)list, sizeof(list) ); list[0] = &aiocb1; list[1] = &aiocb2; ret = lio_listio( LIO_WAIT, list, MAX_LIST, NULL );
对于读操作来说,aio_lio_opcode 域的值为 LIO_READ。对于写操作来说,我们要使用 LIO_WRITE,不过 LIO_NOP 对于不执行操作来说也是有效的。
aiocb结构定义了AIO控制。该结构体包括以下字段
struct aiocb {
    int             aio_fildes;     // File descriptor. 读或写的文件描述符
    off_t           aio_offset;     // File offset. 指定偏移量
    volatile void  *aio_buf;        // Location of buffer. 缓冲区
    size_t          aio_nbytes;     // Length of transfer. 要读写的字节数
    int             aio_reqprio;    // Request priority offset. 请求优先级偏移
    struct sigevent aio_sigevent;   // Signal number and value. I/O事件完成后通知应用程序的方式
    int             aio_lio_opcode; // Operation to be performed. 要执行的操作,仅用于基于列表的异步I/O(lio_listio)
};
struct sigevent
struct sigevent { int sigev_notify; //Notification type. 字段控制通知类型 int sigev_signo; //Signal number. union sigval sigev_value; //Signal value. void (*sigev_notify_function)(union sigval); //Notification function. pthread_attr_t *sigev_notify_attributes; //Notification attributes. };
sigev_notify 的取值范围如下,只有3种情况(对应的宏在<signal.h>中定义)。
SIGEV_NONE
事件发生时,什么也不做.
SIGEV_SIGNAL
事件发生时,将sigev_signo 指定的信号(A queued signal)发送给指定的进程.
SIGEV_THREAD
事件发生时,内核会(在此进程内)以sigev_notify_attributes为线程属性创建一个线程,并且让它执行sigev_notify_function,传入sigev_value作为一个参数.
sigev_signo
在sigev_notify = SIGEV_SIGNAL 时使用,指定信号的种别(number).
sigev_value
在sigev_notify = SIGEV_THREAD 时作为sigev_notify_function 的参数;在sigev_notify = SIGEV_SIGNAL 时随排队信号一起传递给信号处理程序.
union sigval { int sival_int; void *sival_ptr; };
(*sigev_notify_function)(union sigval)
函数指针(指向通知执行函数),在sigev_notify = SIGEV_THREAD 时使用, 其他情况下置为NULL.
sigev_notify_attributes
指向线程属性的指针,在sigev_notify = SIGEV_THREAD 时使用,指定创建线程的属性, 其他情况下置为NULL.
示例:
#include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <string.h> #include <inttypes.h> #include <stdlib.h> #include <libaio.h> #define _GUN_SOURCE #define BUFFER_SIZE 4096 int main(int argc,char **argv) { io_context_t ctx =0; struct iocb cb; struct iocb *cbs[1]; unsigned char *buf; struct io_event events[1]; int fd,ret; fd = open("test.txt",O_RDWR); if(fd < 0) { perror("test.txt"); goto error; } //为iocb分配一个内存 ret=posix_memalign((void **)&buf,512,(BUFFER_SIZE+1)); if(ret<0) { perror("posix_memalign error\n "); goto error1; } memset(buf,0,BUFFER_SIZE+1); //设置当前aio_context_t ret=io_setup(256,&ctx); if(ret<0) { perror("io_setup error\n "); goto error2; } //设置当前iocb io_prep_pread(&cb,fd,buf,BUFFER_SIZE,0); //设置iocb的指针数组 cbs[0] = &cb; ret=io_submit(ctx,1,cbs); //返回值为AIO下发成功的个数 if(ret!=1) { if(ret<0) { perror("io_submit error\n "); } else { perror("io_submit can't finish\n "); } goto error3; } //等待AIO返回有两种方式 //1.设置回掉函数 调用io_set_callback来设置 //2.调用io_getevents,进行阻塞等待 struct io_event *events 作为返回指针 //ret返回期望完成的最少AIO个数 ret=io_getevents(ctx,1,1,events,NULL); if(ret!=1) { if(ret<0) { perror("io_getevents error\n "); } else { perror("io_getevents can't finish\n "); } goto error3; } //解析完成的AIO请求 if(events[0].res2==0) { //正确读出 printf("%s\n",buf); } else { printf("aio error\n"); goto error3; } //AIO完成撤销所有下发的AIO if((ret=io_destroy(ctx))<0) { perror("io_destroy error\n "); goto error2; } free(buf); close(fd); return 0; error3: if((ret=io_destroy(ctx))<0) perror("io_destroy error\n "); error2: free(buf); error1: close(fd); error: return -1; }
// Asynchronous file / directory copy built on POSIX AIO.
//
// Every source file is split into page-sized chunks. Each chunk is read with
// aio_read(); the read-completion callback (SIGEV_THREAD) immediately issues
// an aio_write() of the same buffer to the destination, and the
// write-completion callback releases the chunk's resources. main() waits on a
// semaphore until every issued request has reported completion.

// C++ standard library
#include <atomic>
#include <iostream>
#include <string>

// C standard library
#include <errno.h>
#include <inttypes.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// POSIX / system
#include <aio.h>
#include <fcntl.h>
#include <ftw.h>
#include <semaphore.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#define BUF_MAX 128
#define FD_MAX 1000

static size_t buffer_size;                 // size of one AIO chunk
static uint64_t page_size;                 // system page size
std::atomic<unsigned long> num_requests;   // total AIO requests issued so far
static sem_t blocking_waiter;              // posted once per completed request
static std::string src;                    // normalized source path
static std::string dst;                    // normalized destination root

void aio_write_handler(sigval_t sigval);

// Per-file state shared by all chunks of one copy. The descriptors are closed
// when the last reference (one per in-flight chunk plus one held by the
// issuing loop) is dropped.
struct file_copy_state {
    file_copy_state(int src_fd, int dst_fd)
        : m_src_fd(src_fd), m_dst_fd(dst_fd), m_refs(1) {}

    int m_src_fd;
    int m_dst_fd;
    std::atomic<unsigned long> m_refs;
};

// Context handed to a completion callback through sigev_value.sival_ptr.
typedef struct handler_context {
    struct aiocb* m_aiocb;      // control block of the request that completed
    size_t m_offset;            // file offset of this chunk
    size_t m_file_size;         // total size of the source file
    int m_src_fd;
    int m_dst_fd;
    file_copy_state* m_file;    // shared per-file state (descriptor ownership)
} handler_context;

// Drop one reference on the per-file state; the last one closes both files.
static void file_copy_unref(file_copy_state* state)
{
    if (state->m_refs.fetch_sub(1) != 1) {
        return;
    }
    close(state->m_src_fd);
    if (close(state->m_dst_fd) < 0) {
        perror("destination file close error");
    }
    delete state;
}

// Terminate the program if the request behind `cb` did not complete cleanly.
// aio_error() returns the error number itself (or -1 with errno set when the
// query fails), so perror() alone would report an unrelated errno.
static void die_on_aio_error(const char* what, struct aiocb* cb)
{
    int err = aio_error(cb);
    if (err == 0) {
        return;
    }
    if (err == -1) {
        err = errno;
    }
    fprintf(stderr, "%s: %s\n", what, strerror(err));
    exit(-1);
}

// Read-completion callback: forwards the chunk that was just read to the
// destination file with an asynchronous write, then reports the read as done.
void aio_read_handler (sigval_t sigval)
{
    handler_context* hctx = static_cast<handler_context*>(sigval.sival_ptr);

    die_on_aio_error("read aio error", hctx->m_aiocb);
    ssize_t nbytes = aio_return(hctx->m_aiocb);
    if (nbytes < 0) {
        perror("read aio error");
        exit(-1);
    }

    // The data buffer is handed over to the write request unchanged.
    void* buffer = const_cast<void*>(hctx->m_aiocb->aio_buf);

    // Value-initialization zeroes both structures.
    struct aiocb* w_aiocb = new struct aiocb();
    handler_context* w_context = new handler_context();

    // context to be passed to the write handler
    w_context->m_aiocb = w_aiocb;
    w_context->m_offset = hctx->m_offset;
    w_context->m_file_size = hctx->m_file_size;
    w_context->m_src_fd = hctx->m_src_fd;
    w_context->m_dst_fd = hctx->m_dst_fd;
    w_context->m_file = hctx->m_file;

    // basic setup: write exactly what was read, at the same offset
    w_aiocb->aio_fildes = hctx->m_dst_fd;
    w_aiocb->aio_nbytes = static_cast<size_t>(nbytes);
    w_aiocb->aio_offset = hctx->m_offset;
    w_aiocb->aio_buf = buffer;

    // thread callback
    w_aiocb->aio_sigevent.sigev_notify = SIGEV_THREAD;
    w_aiocb->aio_sigevent.sigev_notify_function = aio_write_handler;
    w_aiocb->aio_sigevent.sigev_notify_attributes = NULL;
    w_aiocb->aio_sigevent.sigev_value.sival_ptr = (void *)w_context;

    // The read request's bookkeeping is no longer needed.
    delete hctx->m_aiocb;
    delete hctx;

    // Count the write BEFORE issuing it: its completion callback may post the
    // semaphore immediately, and main() must not see a stale total and stop
    // waiting while requests are still outstanding.
    ++num_requests;
    if (aio_write(w_aiocb) < 0) {
        perror("aio_write error");
        exit(-1);
    }

    sem_post(&blocking_waiter);
}

// Write-completion callback: verifies the write, releases the chunk's buffer
// and bookkeeping, and reports the write as done.
void aio_write_handler (sigval_t sigval)
{
    handler_context* hctx = static_cast<handler_context*>(sigval.sival_ptr);

    die_on_aio_error("write aio error", hctx->m_aiocb);
    ssize_t nbytes = aio_return(hctx->m_aiocb);
    if (nbytes < 0 || static_cast<size_t>(nbytes) != hctx->m_aiocb->aio_nbytes) {
        fprintf(stderr, "write aio error: short write at offset %zu\n",
                hctx->m_offset);
        exit(-1);
    }

    file_copy_state* file = hctx->m_file;
    free(const_cast<void*>(hctx->m_aiocb->aio_buf));
    delete hctx->m_aiocb;
    delete hctx;
    file_copy_unref(file);

    sem_post(&blocking_waiter);
}

// Copy one filesystem entry. A directory is simply created at dst_file; a
// file has one asynchronous read queued per page-sized chunk (the writes are
// issued from the read callbacks). Returns 0; exits the process on error.
int copy_regular (const char* src_file, const char* dst_file)
{
    // get the page_size for the system
    page_size = getpagesize();

    struct stat stat_buf, stat_dst;

    // stat the source file
    if (stat(src_file, &stat_buf) < 0) {
        perror("source file stat error");
        exit(-1);
    }

    // if its a directory, create and exit
    if (S_ISDIR(stat_buf.st_mode)) {
        if (mkdir(dst_file, S_IRWXU | S_IRWXG)) {
            perror("mkdir error");
            exit(-1);
        }
        return 0;
    }

    // open the source file for reading
    int src_fd = open(src_file, O_RDONLY);
    if (src_fd < 0) {
        perror("source file open error");
        exit(-1);
    }

    // Open the destination WITHOUT O_TRUNC: if it turns out to be the source
    // itself, truncating here would destroy the data we are about to copy.
    int dst_fd = open(dst_file, O_WRONLY | O_CREAT, stat_buf.st_mode);
    if (dst_fd < 0) {
        perror("destination file open error");
        exit(-1);
    }

    if (fstat(dst_fd, &stat_dst)) {
        perror("fstat destination error");
        exit(-1);
    }

    // check if input and output are the same
    if (stat_buf.st_dev == stat_dst.st_dev && stat_buf.st_ino == stat_dst.st_ino) {
        close(src_fd);
        close(dst_fd);
        return 0;
    }

    // Overwriting a longer existing file must not leave its old tail behind.
    if (S_ISREG(stat_dst.st_mode) && stat_dst.st_size > 0) {
        if (ftruncate(dst_fd, 0) < 0) {
            perror("destination file truncate error");
            exit(-1);
        }
    }

    const off_t file_size = stat_buf.st_size;

    // tell the kernel that we will need the input file (advisory only)
    posix_fadvise(src_fd, 0, file_size, POSIX_FADV_WILLNEED);

    // more efficient space allocation via fallocate for dst file; a zero
    // length is rejected by fallocate, so skip it for empty files
    if (file_size > 0 && fallocate(dst_fd, 0, 0, file_size) < 0) {
        perror("destination file fallocate");
    }

    buffer_size = page_size;

    // The issuing loop holds one reference so the descriptors cannot be
    // closed by a fast completion while chunks are still being queued.
    file_copy_state* file = new file_copy_state(src_fd, dst_fd);

    // now start sending aio read requests, one per chunk
    for (off_t offset = 0; offset < file_size;
         offset += static_cast<off_t>(buffer_size)) {
        // malloc does not reliably signal failure through errno; test the
        // returned pointer instead
        void* buffer_block = malloc(buffer_size);
        if (buffer_block == NULL) {
            perror("malloc for buffer error");
            exit(-1);
        }

        // Value-initialization zeroes both structures.
        struct aiocb* r_aiocb = new struct aiocb();
        handler_context* r_context = new handler_context();

        // context to be passed to handler
        r_context->m_aiocb = r_aiocb;
        r_context->m_offset = static_cast<size_t>(offset);
        r_context->m_file_size = static_cast<size_t>(file_size);
        r_context->m_src_fd = src_fd;
        r_context->m_dst_fd = dst_fd;
        r_context->m_file = file;

        // basic setup
        r_aiocb->aio_fildes = src_fd;
        r_aiocb->aio_nbytes = buffer_size;
        r_aiocb->aio_offset = offset;
        r_aiocb->aio_buf = buffer_block;

        // thread callback
        r_aiocb->aio_sigevent.sigev_notify = SIGEV_THREAD;
        r_aiocb->aio_sigevent.sigev_notify_function = aio_read_handler;
        r_aiocb->aio_sigevent.sigev_notify_attributes = NULL;
        r_aiocb->aio_sigevent.sigev_value.sival_ptr = (void *)r_context;

        ++file->m_refs;     // released by the chunk's write callback
        ++num_requests;
        if (aio_read(r_aiocb) < 0) {
            perror("aio_read error");
            exit(-1);
        }
    }

    // Drop the issuer's reference; closes the files now if nothing is pending
    // (e.g. an empty source file).
    file_copy_unref(file);
    return 0;
}

// Return the trailing `depth` path components of fname, including the
// separator in front of them (e.g. ("a/b/c", 2) -> "/b/c"). A trailing
// separator is not counted as a component boundary. If fname has fewer
// separators than requested, the whole name is returned.
std::string split_filename(std::string fname, int depth)
{
    if (fname.empty()) {
        return fname;
    }

    std::string::size_type pos = fname.length();
    if (fname.find_last_of("/\\") == fname.length() - 1) {
        --pos;      // skip the trailing separator
    }

    for (int i = 0; i < depth; ++i) {
        if (pos == 0) {
            break;  // already at the start; pos - 1 would wrap around
        }
        std::string::size_type found = fname.find_last_of("/\\", pos - 1);
        if (found == std::string::npos) {
            pos = 0;
            break;
        }
        pos = found;
    }
    return fname.substr(pos);
}

// nftw() callback: copies every visited entry below the traversal root to the
// matching location under `dst`. The root itself (level 0) is skipped because
// the caller has already created it.
int tree_walk (const char* fpath, const struct stat* sb, int typeflag, struct FTW* ftwbuf)
{
    (void)sb;
    (void)typeflag;

    if (ftwbuf->level == 0) {
        return 0;
    }
    std::string new_dst_path = dst + split_filename(std::string(fpath), ftwbuf->level);
    copy_regular(fpath, new_dst_path.c_str());
    return 0;
}

// Prefix "./" to a path that contains no '/' or whose first '/' is its last
// character, so that split_filename() always finds a separator to cut at.
std::string format_path(std::string path)
{
    std::string::size_type pos = path.find('/');
    if (pos == std::string::npos || pos == path.length() - 1) {
        std::string fpath = "./";
        fpath.append(path);
        return fpath;
    }
    return path;
}

// Create the destination root `dst` and copy the tree rooted at `src` into it.
static void copy_tree(void)
{
    if (mkdir(dst.c_str(), S_IRWXU | S_IRWXG)) {
        perror("destination mkdir failed");
        exit(-1);
    }
    // traverse the entire tree and copy files or directories
    if (nftw(src.c_str(), tree_walk, FD_MAX, FTW_PHYS)) {
        perror("nftw traversal error");
        exit(-1);
    }
}

// Usage: <program> <source> <destination>
// Copies a file or directory tree, waits for all asynchronous requests to
// finish and prints the elapsed wall-clock time.
int main(int argc, char * argv[])
{
    if (argc != 3) {
        printf("usage : %s <source> <destination>\n", argv[0]);
        return 0;
    }

    struct timespec tv1, tv2;
    num_requests = 0;
    clock_gettime(CLOCK_MONOTONIC, &tv1);
    if (sem_init(&blocking_waiter, 0, 0)) {
        perror("sem_init error");
        exit(-1);
    }

    src = format_path(argv[1]);
    dst = format_path(argv[2]);

    struct stat src_stat, dst_stat;
    if (stat(src.c_str(), &src_stat)) {
        perror("source file stat error");
        exit(-1);
    }

    if (stat(dst.c_str(), &dst_stat)) {
        // if error, must be because of a no entry
        if (errno != ENOENT) {
            perror("destination file stat error");
            exit(-1);
        }
        // destination is new: copy a directory tree or a single file
        if (S_ISDIR(src_stat.st_mode)) {
            copy_tree();
        } else {
            copy_regular(src.c_str(), dst.c_str());
        }
    } else if (S_ISDIR(src_stat.st_mode)) {
        // dst already exists; dir -> file is an error
        if (!S_ISDIR(dst_stat.st_mode)) {
            // errno is not meaningful here, so do not use perror
            fprintf(stderr, "cannot copy directory to non-directory\n");
            exit(-1);
        }
        // dir -> dir: copy into a sub-directory named after the source
        dst.append(split_filename(src, 1));
        copy_tree();
    } else {
        if (S_ISDIR(dst_stat.st_mode)) {
            // file -> dir: keep the source's file name inside the directory
            dst.append(split_filename(src, 1));
        }
        // file -> file overwrite, or file -> dir/<name>
        copy_regular(src.c_str(), dst.c_str());
    }

    // Wait for one post per issued request. num_requests can still grow while
    // we wait (each read schedules a write), so it is re-read every iteration.
    for (unsigned long done = 0; done < num_requests.load(); ++done) {
        while (sem_wait(&blocking_waiter) == -1 && errno == EINTR) {
            // interrupted by a signal: retry
        }
    }
    sem_destroy(&blocking_waiter);

    clock_gettime(CLOCK_MONOTONIC, &tv2);
    int64_t elapsed_ns =
        static_cast<int64_t>(tv2.tv_sec - tv1.tv_sec) * 1000000000LL +
        (tv2.tv_nsec - tv1.tv_nsec);
    // nanosecond remainder needs nine digits to read as a decimal fraction
    printf("completion time = %" PRId64 ".%09" PRId64 " s\n",
           elapsed_ns / 1000000000LL, elapsed_ns % 1000000000LL);
    return 0;
}