文件复制
使用c语言实现一下文件复制功能:
第一个版本:使用fgetc, fputc。
/*
 * 文件复制, 第一个版本: fgetc/fputc 逐字节复制。
 * 用法: ./fget <fromfile> <tofile>
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/sendfile.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <fromfile> <tofile>\n", argv[0]);
        return 1;
    }
    const char *fromfile = argv[1];
    const char *tofile = argv[2];

    FILE *fromfp = fopen(fromfile, "r");
    if (fromfp == NULL) {
        perror("fopen fromfile");
        return 1;
    }
    FILE *tofp = fopen(tofile, "w");
    if (tofp == NULL) {
        perror("fopen tofile");
        fclose(fromfp);
        return 1;
    }

    /* c 必须是 int 而不是 char: fgetc 返回 0..255 或 EOF(-1)。
       若用 unsigned char, (c != EOF) 永真, 循环不会结束;
       若用 signed char, 读到 0xFF 字节会被误判为 EOF 提前退出。 */
    int c;
    while ((c = fgetc(fromfp)) != EOF) {
        if (fputc(c, tofp) == EOF) {
            perror("fputc");
            break;
        }
    }

    fclose(fromfp);
    /* fclose 会冲刷写缓冲区, 写文件必须检查其返回值。 */
    if (fclose(tofp) != 0) {
        perror("fclose");
        return 1;
    }
    return 0;
}
使用fallocate -l 512M a1.txt, 生成一个512M的文件
garlic@garlic:~/sendfile$ sudo strace -c ./fget a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 58.23 4.191305 31 131072 write 41.77 3.006373 22 131074 read 0.00 0.000051 12 4 close 0.00 0.000000 0 8 mmap 0.00 0.000000 0 3 mprotect 0.00 0.000000 0 1 munmap 0.00 0.000000 0 3 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve 0.00 0.000000 0 2 1 arch_prctl 0.00 0.000000 0 1 set_tid_address 0.00 0.000000 0 4 openat 0.00 0.000000 0 4 newfstatat 0.00 0.000000 0 1 set_robust_list 0.00 0.000000 0 1 prlimit64 0.00 0.000000 0 1 getrandom 0.00 0.000000 0 1 rseq ------ ----------- ----------- --------- --------- ---------------- 100.00 7.197729 27 262184 2 total
使用strace统计一下结果, 耗时7s, 主要系统调用是read和write, 分别进行了13万左右的调用。(第二次运行快一些,应该是这部分数据还没有换出内存)
garlic@garlic:~/sendfile$ sudo strace -c ./fget a.txt b.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 67.94 1.549128 11 131072 write 32.04 0.730500 5 131074 read 0.00 0.000106 26 4 openat 0.00 0.000063 21 3 mprotect 0.00 0.000049 12 4 newfstatat 0.00 0.000046 15 3 brk 0.00 0.000038 38 1 munmap 0.00 0.000022 22 1 getrandom 0.00 0.000021 21 1 prlimit64 0.00 0.000011 2 4 close 0.00 0.000000 0 8 mmap 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve 0.00 0.000000 0 2 1 arch_prctl 0.00 0.000000 0 1 set_tid_address 0.00 0.000000 0 1 set_robust_list 0.00 0.000000 0 1 rseq ------ ----------- ----------- --------- --------- ---------------- 100.00 2.279984 8 262184 2 total
第二个版本:使用fread, fwrite: 这个版本可以读取指定大小的内容到定义的一段自定义缓冲区中
/*
 * 文件复制, 第二个版本: fread/fwrite, 使用自定义缓冲区。
 * 用法: ./fread <fromfile> <tofile>
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/sendfile.h>
#include <stdio.h>
#include <unistd.h>

#define BUF_SIZE 4096

int main(int argc, char **argv)
{
    char buf[BUF_SIZE];

    if (argc != 3) {
        fprintf(stderr, "usage: %s <fromfile> <tofile>\n", argv[0]);
        return 1;
    }
    const char *fromfile = argv[1];
    const char *tofile = argv[2];

    FILE *fromfp = fopen(fromfile, "r");
    if (fromfp == NULL) {
        perror("fopen fromfile");
        return 1;
    }
    FILE *tofp = fopen(tofile, "w");
    if (tofp == NULL) {
        perror("fopen tofile");
        fclose(fromfp);
        return 1;
    }

    size_t n;
    while ((n = fread(buf, 1, sizeof(buf), fromfp)) > 0) {
        /* fwrite 出错时可能写入少于 n 字节, 必须检查。 */
        if (fwrite(buf, 1, n, tofp) != n) {
            perror("fwrite");
            break;
        }
    }
    /* fread 返回 0 既可能是 EOF 也可能是出错, 用 ferror 区分。 */
    if (ferror(fromfp))
        perror("fread");

    fclose(fromfp);
    if (fclose(tofp) != 0) {
        perror("fclose");
        return 1;
    }
    return 0;
}
garlic@garlic:~/sendfile$ fallocate -l 512M a2.txt garlic@garlic:~/sendfile$ sudo strace -c ./fread a2.txt b2.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 57.19 4.189182 31 131072 write 42.78 3.133982 23 131074 read 0.01 0.000536 536 1 execve 0.01 0.000369 46 8 mmap 0.00 0.000209 52 4 openat 0.00 0.000123 30 4 newfstatat 0.00 0.000118 39 3 mprotect 0.00 0.000084 28 3 brk 0.00 0.000076 19 4 close 0.00 0.000058 29 2 pread64 0.00 0.000052 26 2 1 arch_prctl 0.00 0.000048 48 1 munmap 0.00 0.000035 35 1 1 access 0.00 0.000027 27 1 getrandom 0.00 0.000026 26 1 prlimit64 0.00 0.000026 26 1 rseq 0.00 0.000025 25 1 set_tid_address 0.00 0.000025 25 1 set_robust_list ------ ----------- ----------- --------- --------- ---------------- 100.00 7.325001 27 262184 2 total
再与第一个方案对比,发现变化不是很大, 可以看到两个方案调用read、write的次数是一样的。
可以看下fgetc的具体实现,针对复制文件这种场景,使用glibc提供的两对库函数fgetc/fputc, fread/fwrite在最终api调用次数上几乎是一致的。
另外还有一个地方开始的时候没有注意到,第二个方案申请的buffer刚好是4096.
fgetc是glibc一个库函数:
https://elixir.bootlin.com/glibc/glibc-2.38.9000/source/libio/getc.c#L48
int _IO_getc (FILE *fp) { int result; CHECK_FILE (fp, EOF); if (!_IO_need_lock (fp)) return _IO_getc_unlocked (fp); _IO_acquire_lock (fp); result = _IO_getc_unlocked (fp); _IO_release_lock (fp); return result; } #undef getc weak_alias (_IO_getc, getc) weak_alias (_IO_getc, fgetc) ...
_IO_getc 通过 _IO_getc_unlocked 获取结果,
https://elixir.bootlin.com/glibc/glibc-2.38.9000/source/libio/libio.h#L163
#define _IO_getc_unlocked(_fp) __getc_unlocked_body (_fp)
https://elixir.bootlin.com/glibc/glibc-2.38.9000/source/libio/bits/types/struct_FILE.h#L102
#define __getc_unlocked_body(_fp) \ (__glibc_unlikely ((_fp)->_IO_read_ptr >= (_fp)->_IO_read_end) \ ? __uflow (_fp) : *(unsigned char *) (_fp)->_IO_read_ptr++)
可以看到fp结构体中定义了一系列指针
/* The tag name of this struct is _IO_FILE to preserve historic C++ mangled names for functions taking FILE* arguments. That name should not be used in new code. */ struct _IO_FILE { int _flags; /* High-order word is _IO_MAGIC; rest is flags. */ /* The following pointers correspond to the C++ streambuf protocol. */ char *_IO_read_ptr; /* Current read pointer */ char *_IO_read_end; /* End of get area. */ char *_IO_read_base; /* Start of putback+get area. */ char *_IO_write_base; /* Start of put area. */ char *_IO_write_ptr; /* Current put pointer. */ char *_IO_write_end; /* End of put area. */ char *_IO_buf_base; /* Start of reserve area. */ char *_IO_buf_end; /* End of reserve area. */ ...
用户缓冲区判断: 当前read指针(_IO_read_ptr)大于等于缓冲区最后位置(_IO_read_end)时, 说明缓冲区为空, 调用 __uflow 申请并填充缓冲区
https://elixir.bootlin.com/glibc/glibc-2.37.9000/source/libio/libioP.h#L325
extern const struct _IO_jump_t _IO_file_jumps; libc_hidden_proto (_IO_file_jumps) extern const struct _IO_jump_t _IO_file_jumps_mmap attribute_hidden; extern const struct _IO_jump_t _IO_file_jumps_maybe_mmap attribute_hidden; extern const struct _IO_jump_t _IO_wfile_jumps; libc_hidden_proto (_IO_wfile_jumps) extern const struct _IO_jump_t _IO_wfile_jumps_mmap attribute_hidden; extern const struct _IO_jump_t _IO_wfile_jumps_maybe_mmap attribute_hidden; extern const struct _IO_jump_t _IO_old_file_jumps attribute_hidden; extern const struct _IO_jump_t _IO_streambuf_jumps; extern const struct _IO_jump_t _IO_old_proc_jumps attribute_hidden; extern const struct _IO_jump_t _IO_str_jumps attribute_hidden; extern const struct _IO_jump_t _IO_wstr_jumps attribute_hidden;
当前理解: glibc将通用逻辑放到各个api中,将不同的部分放到_IO_jump_t 做为api补充。
通过underflow索引找到对应的处理函数_IO_file_underflow
const struct _IO_jump_t _IO_file_jumps libio_vtable = { JUMP_INIT_DUMMY, JUMP_INIT(finish, _IO_file_finish), JUMP_INIT(overflow, _IO_file_overflow), JUMP_INIT(underflow, _IO_file_underflow), JUMP_INIT(uflow, _IO_default_uflow), JUMP_INIT(pbackfail, _IO_default_pbackfail), JUMP_INIT(xsputn, _IO_file_xsputn), JUMP_INIT(xsgetn, _IO_file_xsgetn), JUMP_INIT(seekoff, _IO_new_file_seekoff), JUMP_INIT(seekpos, _IO_default_seekpos), JUMP_INIT(setbuf, _IO_new_file_setbuf), JUMP_INIT(sync, _IO_new_file_sync), JUMP_INIT(doallocate, _IO_file_doallocate), JUMP_INIT(read, _IO_file_read), JUMP_INIT(write, _IO_new_file_write), JUMP_INIT(seek, _IO_file_seek), JUMP_INIT(close, _IO_file_close), JUMP_INIT(stat, _IO_file_stat), JUMP_INIT(showmanyc, _IO_default_showmanyc), JUMP_INIT(imbue, _IO_default_imbue) }; libc_hidden_data_def (_IO_file_jumps)
调用 _IO_default_uflow
int _IO_default_uflow (FILE *fp) { int ch = _IO_UNDERFLOW (fp); if (ch == EOF) return EOF; return *(unsigned char *) fp->_IO_read_ptr++; }
调用了 _IO_UNDERFLOW , 在这里申请缓冲区
/* The 'underflow' hook tries to fills the get buffer. It returns the next character (as an unsigned char) or EOF. The next character remains in the get buffer, and the get position is not changed. It matches the streambuf::underflow virtual function. */ #define _IO_UNDERFLOW(FP) JUMP0 (__underflow, FP)
#define JUMP0(FUNC, THIS) (_IO_JUMPS_FUNC(THIS)->FUNC) (THIS) --> #if _IO_JUMPS_OFFSET # define _IO_JUMPS_FUNC(THIS) \ (IO_validate_vtable \ (*(struct _IO_jump_t **) ((void *) &_IO_JUMPS_FILE_plus (THIS) \ + (THIS)->_vtable_offset))) 。。。。
int _IO_new_file_underflow (FILE *fp) { ssize_t count; /* C99 requires EOF to be "sticky". */ if (fp->_flags & _IO_EOF_SEEN) return EOF; if (fp->_flags & _IO_NO_READS) { fp->_flags |= _IO_ERR_SEEN; __set_errno (EBADF); return EOF; } if (fp->_IO_read_ptr < fp->_IO_read_end) return *(unsigned char *) fp->_IO_read_ptr; if (fp->_IO_buf_base == NULL) { /* Maybe we already have a push back pointer. */ if (fp->_IO_save_base != NULL) { free (fp->_IO_save_base); fp->_flags &= ~_IO_IN_BACKUP; } _IO_doallocbuf (fp); } 。。。。
如果缓冲区起始地址为空, 通过 _IO_doallocbuf 申请
void _IO_doallocbuf (FILE *fp) { if (fp->_IO_buf_base) return; if (!(fp->_flags & _IO_UNBUFFERED) || fp->_mode > 0) if (_IO_DOALLOCATE (fp) != EOF) return; _IO_setb (fp, fp->_shortbuf, fp->_shortbuf+1, 0); }
再用_IO_DOALLOCATE查表 _IO_file_jumps 得到函数 _IO_file_doallocate
/* Allocate a file buffer, or switch to unbuffered I/O. Streams for TTY devices default to line buffered. */ int _IO_file_doallocate (FILE *fp) { size_t size; char *p; struct __stat64_t64 st; size = BUFSIZ; if (fp->_fileno >= 0 && __builtin_expect (_IO_SYSSTAT (fp, &st), 0) >= 0) { if (S_ISCHR (st.st_mode)) { /* Possibly a tty. */ if ( #ifdef DEV_TTY_P DEV_TTY_P (&st) || #endif local_isatty (fp->_fileno)) fp->_flags |= _IO_LINE_BUF; } #if defined _STATBUF_ST_BLKSIZE if (st.st_blksize > 0 && st.st_blksize < BUFSIZ) size = st.st_blksize; #endif } p = malloc (size); if (__glibc_unlikely (p == NULL)) return EOF; _IO_setb (fp, p, p + size, 1); return 1; } libc_hidden_def (_IO_file_doallocate)
这里可以看到其默认缓冲区大小的设置: 如果可以获取到文件系统的块大小, 并且小于BUFSIZ, 那么缓冲区大小设置为文件系统的blocksize。
也可以通过命令查看一下
garlic@garlic:~/sendfile$ stat a1.txt File: a1.txt Size: 536870912 Blocks: 1048584 IO Block: 4096 regular file Device: 253,0 Inode: 2122787 Links: 1 Access: (0664/-rw-rw-r--) Uid: ( 1000/ garlic) Gid: ( 1000/ garlic) Access: 2023-09-03 12:25:28.703755625 +0000 Modify: 2023-09-02 08:18:10.952596194 +0000 Change: 2023-09-02 08:18:10.952596194 +0000 Birth: 2023-09-02 08:18:10.952596194 +0000
由于标准文件库提供的缓存功能,这种涉及使用fgetc和fread两种方式, 在系统调用read,write次数是一样的。
下面可以尝试将fread缓冲区变小一下, 调整到1024
garlic@garlic:~/sendfile$ cat fread.c #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/sendfile.h> #include <stdio.h> #include <unistd.h> #define BUF_SIZE 1024 int main(int argc, char **argv) { char buf[BUF_SIZE]; const char *fromfile = argv[1]; const char *tofile = argv[2]; FILE *fromfd = fopen(fromfile, "r"); FILE *tofd = fopen(tofile, "w"); size_t n; while ((n=fread(buf, 1, sizeof(buf), fromfd)) > 0) { fwrite(buf, 1, n, tofd); } fclose(fromfd); fclose(tofd); }
系统调用read write次数没有变化
garlic@garlic:~/sendfile$ fallocate -l 512M a2.txt garlic@garlic:~/sendfile$ sudo strace -c ./fread a2.txt b2.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 58.55 3.735239 28 131072 write 41.45 2.644221 20 131074 read 0.00 0.000021 5 4 close 0.00 0.000000 0 8 mmap 0.00 0.000000 0 3 mprotect 0.00 0.000000 0 1 munmap 0.00 0.000000 0 3 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve 0.00 0.000000 0 2 1 arch_prctl 0.00 0.000000 0 1 set_tid_address 0.00 0.000000 0 4 openat 0.00 0.000000 0 4 newfstatat 0.00 0.000000 0 1 set_robust_list 0.00 0.000000 0 1 prlimit64 0.00 0.000000 0 1 getrandom 0.00 0.000000 0 1 rseq ------ ----------- ----------- --------- --------- ---------------- 100.00 6.379481 24 262184 2 total
下面再尝试将fread缓冲区调大一些, 调整到8192
garlic@garlic:~/sendfile$ cat fread.c #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/sendfile.h> #include <stdio.h> #include <unistd.h> #define BUF_SIZE 8192 int main(int argc, char **argv) { char buf[BUF_SIZE]; const char *fromfile = argv[1]; const char *tofile = argv[2]; FILE *fromfd = fopen(fromfile, "r"); FILE *tofd = fopen(tofile, "w"); size_t n; while ((n=fread(buf, 1, sizeof(buf), fromfd)) > 0) { fwrite(buf, 1, n, tofd); } fclose(fromfd); fclose(tofd); }
garlic@garlic:~/sendfile$ sudo strace -c ./fread a2.txt b2.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 74.07 3.978765 30 131071 write 25.91 1.391691 21 65538 read 0.01 0.000424 424 1 execve 0.01 0.000277 34 8 mmap 0.00 0.000163 40 4 openat 0.00 0.000100 25 4 newfstatat 0.00 0.000091 30 3 mprotect 0.00 0.000086 21 4 close 0.00 0.000066 22 3 brk 0.00 0.000043 21 2 pread64 0.00 0.000042 21 2 1 arch_prctl 0.00 0.000037 37 1 munmap 0.00 0.000029 29 1 1 access 0.00 0.000026 26 1 set_tid_address 0.00 0.000021 21 1 getrandom 0.00 0.000020 20 1 prlimit64 0.00 0.000020 20 1 rseq 0.00 0.000019 19 1 set_robust_list ------ ----------- ----------- --------- --------- ---------------- 100.00 5.371920 27 196647 2 total
fread调用了_IO_sgetn -> _IO_XSGETN ->xsgetn ->_IO_file_xsgetn
size_t _IO_fread (void *buf, size_t size, size_t count, FILE *fp) { size_t bytes_requested = size * count; size_t bytes_read; CHECK_FILE (fp, 0); if (bytes_requested == 0) return 0; _IO_acquire_lock (fp); bytes_read = _IO_sgetn (fp, (char *) buf, bytes_requested); _IO_release_lock (fp); return bytes_requested == bytes_read ? count : bytes_read / size; }
size_t _IO_sgetn (FILE *fp, void *data, size_t n) { /* FIXME handle putback buffer here! */ return _IO_XSGETN (fp, data, n); } libc_hidden_def (_IO_sgetn)
#define _IO_XSGETN(FP, DATA, N) JUMP2 (__xsgetn, FP, DATA, N)
https://elixir.bootlin.com/glibc/glibc-2.38.9000/source/libio/fileops.c#L1272
这里面有段逻辑,根据需要读取的数据长度,这里就是我们fread例子中buf大小,按照文件系统块大小为单位取整。
_IO_file_xsgetn (FILE *fp, void *data, size_t n) { ... want = n; ... /* These must be set before the sysread as we might longjmp out waiting for input. */ _IO_setg (fp, fp->_IO_buf_base, fp->_IO_buf_base, fp->_IO_buf_base); _IO_setp (fp, fp->_IO_buf_base, fp->_IO_buf_base); /* Try to maintain alignment: read a whole number of blocks. */ count = want; if (fp->_IO_buf_base) { size_t block_size = fp->_IO_buf_end - fp->_IO_buf_base; if (block_size >= 128) count -= want % block_size; } count = _IO_SYSREAD (fp, s, count); if (count <= 0) { if (count == 0) fp->_flags |= _IO_EOF_SEEN; else fp->_flags |= _IO_ERR_SEEN; break; } s += count; want -= count; if (fp->_offset != _IO_pos_BAD) _IO_pos_adjust (fp->_offset, count); } } return n - want; } libc_hidden_def (_IO_file_xsgetn)
因为8192刚好是2个block大小。也就是说, 如果把buf缓冲区的值调整到8192到8192+4096之间, 那么还是按照8192为单位读取。下面将缓冲区大小调整为8192+4095看下效果
生成一个小些文件验证一下 , 可以看到确实还是使用的8192
garlic@garlic:~/sendfile$ fallocate -l 32K a1.txt garlic@garlic:~/sendfile$ sudo strace ./fread a1.txt b1.txt execve("./fread", ["./fread", "a1.txt", "b1.txt"], 0x7ffe99d374a0 /* 13 vars */) = 0 。。。 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096) = 4096 newfstatat(4, "", {st_mode=S_IFREG|0644, st_size=0, ...}, AT_EMPTY_PATH) = 0 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096) = 4096 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096) = 4096 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 read(3, "", 4096) = 0 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096) = 4096 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 read(3, "", 8192) = 0 close(3) = 0 close(4) = 0 exit_group(0) = ? +++ exited with 0 +++b
当然我们也可以直接调用read, write, 绕开标准库, 当然这样就需要我们自己设置缓冲区。
/*
 * 文件复制: 直接使用 read/write 系统调用, 绕开标准库缓冲。
 * 用法: ./read <fromfile> <tofile>
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/sendfile.h>
#include <stdio.h>
#include <unistd.h>

#define BUF_SIZE 4096

int main(int argc, char **argv)
{
    char buf[BUF_SIZE];

    if (argc != 3) {
        fprintf(stderr, "usage: %s <fromfile> <tofile>\n", argv[0]);
        return 1;
    }
    const char *fromfile = argv[1];
    const char *tofile = argv[2];

    int fromfd = open(fromfile, O_RDONLY);
    if (fromfd < 0) {
        perror("open fromfile");
        return 1;
    }
    struct stat stat_buf;
    if (fstat(fromfd, &stat_buf) < 0) {
        perror("fstat");
        close(fromfd);
        return 1;
    }
    /* O_TRUNC: 若目标文件已存在且更长, 不截断会残留旧数据。 */
    int tofd = open(tofile, O_WRONLY | O_CREAT | O_TRUNC, stat_buf.st_mode);
    if (tofd < 0) {
        perror("open tofile");
        close(fromfd);
        return 1;
    }

    ssize_t n;
    while ((n = read(fromfd, buf, sizeof(buf))) > 0) {
        /* write 可能发生短写, 循环直到 n 字节全部写出。 */
        ssize_t off = 0;
        while (off < n) {
            ssize_t w = write(tofd, buf + off, (size_t)(n - off));
            if (w < 0) {
                perror("write");
                close(fromfd);
                close(tofd);
                return 1;
            }
            off += w;
        }
    }
    if (n < 0)
        perror("read");

    close(fromfd);
    close(tofd);
    return n < 0 ? 1 : 0;
}
执行效果是一样的
garlic@garlic:~/sendfile$ fallocate -l 512M a2.txt garlic@garlic:~/sendfile$ sudo strace -c ./fread a2.txt b2.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 57.19 4.189182 31 131072 write 42.78 3.133982 23 131074 read 0.01 0.000536 536 1 execve 0.01 0.000369 46 8 mmap 0.00 0.000209 52 4 openat 0.00 0.000123 30 4 newfstatat 0.00 0.000118 39 3 mprotect 0.00 0.000084 28 3 brk 0.00 0.000076 19 4 close 0.00 0.000058 29 2 pread64 0.00 0.000052 26 2 1 arch_prctl 0.00 0.000048 48 1 munmap 0.00 0.000035 35 1 1 access 0.00 0.000027 27 1 getrandom 0.00 0.000026 26 1 prlimit64 0.00 0.000026 26 1 rseq 0.00 0.000025 25 1 set_tid_address 0.00 0.000025 25 1 set_robust_list ------ ----------- ----------- --------- --------- ---------------- 100.00 7.325001 27 262184 2 totalttiao
调小一下缓冲区, 调整到1024 512M的文件, API次数调用增多
garlic@garlic:~/sendfile$ sudo strace -c ./read a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 52.54 12.925246 24 524288 write 47.46 11.676848 22 524290 read 0.00 0.000000 0 2 close 0.00 0.000000 0 8 mmap 0.00 0.000000 0 3 mprotect 0.00 0.000000 0 1 munmap 0.00 0.000000 0 1 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve 0.00 0.000000 0 2 1 arch_prctl 0.00 0.000000 0 1 set_tid_address 0.00 0.000000 0 4 openat 0.00 0.000000 0 3 newfstatat 0.00 0.000000 0 1 set_robust_list 0.00 0.000000 0 1 prlimit64 0.00 0.000000 0 1 rseq ------ ----------- ----------- --------- --------- ---------------- 100.00 24.602094 23 1048610 2 totalssheng
生成一个4K文件看下api调用情况,没有了标准库提供的缓冲区,按照指定大小读取
garlic@garlic:~/sendfile$ fallocate -l 4K a1.txt garlic@garlic:~/sendfile$ sudo strace ./read a1.txt b1.txt execve("./read", ["./read", "a1.txt", "b1.txt"], 0x7fff86f82e30 /* 13 vars */) = 0 。。。 newfstatat(3, "", {st_mode=S_IFREG|0664, st_size=4096, ...}, AT_EMPTY_PATH) = 0 openat(AT_FDCWD, "b1.txt", O_WRONLY|O_CREAT, 0100664) = 4 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 read(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1024) = 1024 read(3, "", 1024) = 0 exit_group(0) = ? +++ exited with 0 +++
直接调用系统api更直接, 但需要对操作系统有更多的了解。
sendfile
上面几种方案最终都是通过系统调用read, write来实现的,标准io通过缓存的方式,减少了频繁进行系统调用, 而系统调用是需要进行用户态(环3)到内核态切换(环0),需要cpu拷贝和上下文切换处理。
使用最后一个程序看下一个文件复制用户态和内核的相关处理
- 调用read 函数, 用户态切换到内核态, 内核调用IO操作;
- 内核通过虚拟文件系统接口调用磁盘文件系统(ext3 ext4),通过块设备驱动,从磁盘读取文件,并将文件内容拷贝到内核缓冲区;
- cpu将内核缓冲区内容拷贝到用户缓冲区,从内核态切换到用户态;
- 用户态程序获取返回后,调用write函数, 用户态切换到内核态, 内核调用IO操作;
- 内容通过虚拟文件系统接口写入磁盘, 从内核态切换到用户态;
- 获取write返回进行下一步操作。
使用trace-cmd看下内容处理流程,
sudo trace-cmd record -p function_graph ./read a1.txt b1.txt
32786 readtest-20845 0d..1. 59791.558934: function: exit_to_user_mode_prepare 32787 readtest-20845 0d..1. 59791.558935: function: fpregs_assert_state_consistent 32788 readtest-20845 0...1. 59791.558936: function: __x64_sys_read 32789 readtest-20845 0...1. 59791.558936: function: ksys_read 32790 readtest-20845 0...1. 59791.558936: function: __fdget_pos 32791 readtest-20845 0...1. 59791.558937: function: __fget_light 32792 readtest-20845 0...1. 59791.558937: function: vfs_read 32793 readtest-20845 0...1. 59791.558937: function: rw_verify_area 32794 readtest-20845 0...1. 59791.558938: function: security_file_permission 32795 readtest-20845 0...1. 59791.558938: function: apparmor_file_permission 32796 readtest-20845 0...1. 59791.558938: function: aa_file_perm 32797 readtest-20845 0...1. 59791.558938: function: __rcu_read_lock 32798 readtest-20845 0...1. 59791.558939: function: __rcu_read_unlock 32799 readtest-20845 0...1. 59791.558939: function: __fsnotify_parent 32800 readtest-20845 0...1. 59791.558939: function: __get_task_ioprio 32801 readtest-20845 0...1. 59791.558939: function: ext4_file_read_iter 32802 readtest-20845 0...1. 59791.558940: function: generic_file_read_iter 32803 readtest-20845 0...1. 59791.558940: function: filemap_read 32804 readtest-20845 0...1. 59791.558940: function: __cond_resched 32805 readtest-20845 0...1. 59791.558941: function: filemap_get_pages 32806 readtest-20845 0...1. 59791.558941: function: filemap_get_read_batch 32807 readtest-20845 0...1. 59791.558941: function: __rcu_read_lock 32808 readtest-20845 0...1. 59791.558942: function: __rcu_read_unlock 32809 readtest-20845 0...1. 59791.558942: function: folio_mark_accessed 32810 readtest-20845 0...1. 59791.558943: function: touch_atime 32811 readtest-20845 0...1. 59791.558943: function: atime_needs_update 32812 readtest-20845 0...1. 59791.558943: function: mnt_user_ns 32813 readtest-20845 0...1. 59791.558943: function: current_time 32814 readtest-20845 0...1. 
59791.558944: function: ktime_get_coarse_real_ts64 32815 readtest-20845 0...1. 59791.558944: function: __fsnotify_parent 32816 readtest-20845 0d..1. 59791.558944: function: exit_to_user_mode_prepare 32817 readtest-20845 0d..1. 59791.558945: function: fpregs_assert_state_consistent 32818 readtest-20845 0...1. 59791.558946: function: __x64_sys_write 32819 readtest-20845 0...1. 59791.558946: function: ksys_write 32820 readtest-20845 0...1. 59791.558946: function: __fdget_pos 32821 readtest-20845 0...1. 59791.558946: function: __fget_light 32822 readtest-20845 0...1. 59791.558947: function: vfs_write 32823 readtest-20845 0...1. 59791.558947: function: rw_verify_area 32824 readtest-20845 0...1. 59791.558947: function: security_file_permission 32825 readtest-20845 0...1. 59791.558947: function: apparmor_file_permission 32826 readtest-20845 0...1. 59791.558947: function: aa_file_perm 32827 readtest-20845 0...1. 59791.558948: function: __rcu_read_lock 32828 readtest-20845 0...1. 59791.558948: function: __rcu_read_unlock 32829 readtest-20845 0...1. 59791.558948: function: __cond_resched 32830 readtest-20845 0...1. 59791.558948: function: __get_task_ioprio 32831 readtest-20845 0...1. 59791.558949: function: ext4_file_write_iter 32832 readtest-20845 0...1. 59791.558949: function: ext4_buffered_write_iterk
可以看到退出用户态进入内核态后, 通过系统调用到vfs_read->ext4_file_read_iter->generic_file_read_iter
下面看下sendfile的方案。sendfile最初要求写入的一端必须是socket, 从2.6.33版本之后, 写入端也可以是普通文件
Before Linux 2.6.33, out_fd must refer to a socket. Since Linux 2.6.33 it can be any file. If it is a regular file, then sendfile() changes the file offset appropriately.
from https://man7.org/linux/man-pages/man2/sendfile.2.html
/*
 * 文件复制: sendfile, 数据全程在内核态流动, 不经过用户缓冲区。
 * 用法: ./sendfile <fromfile> <tofile>
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/sendfile.h>
#include <stdio.h>
#include <unistd.h>

#define BUF_SIZE 4096

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <fromfile> <tofile>\n", argv[0]);
        return 1;
    }
    const char *fromfile = argv[1];
    const char *tofile = argv[2];

    int fromfd = open(fromfile, O_RDONLY);
    if (fromfd < 0) {
        perror("open fromfile");
        return 1;
    }
    struct stat stat_buf;
    if (fstat(fromfd, &stat_buf) < 0) {
        perror("fstat");
        close(fromfd);
        return 1;
    }
    int tofd = open(tofile, O_WRONLY | O_CREAT | O_TRUNC, stat_buf.st_mode);
    if (tofd < 0) {
        perror("open tofile");
        close(fromfd);
        return 1;
    }

    /* offset 传 NULL: 内核使用并更新 fromfd 自身的文件偏移。
       sendfile 返回 ssize_t, 不要截断成 int。 */
    ssize_t n;
    while ((n = sendfile(tofd, fromfd, NULL, BUF_SIZE)) > 0)
        ;
    if (n < 0)
        perror("sendfile");

    close(fromfd);
    close(tofd);
    return n < 0 ? 1 : 0;
}
生成一个512M的文件。
garlic@garlic:~/sendfile$ fallocate -l 512M a1.txt garlic@garlic:~/sendfile$ sudo strace -c ./sendfile a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 99.99 6.416219 48 131073 sendfile 0.00 0.000116 38 3 mprotect 0.00 0.000071 17 4 openat 0.00 0.000048 48 1 munmap 0.00 0.000039 13 3 newfstatat 0.00 0.000026 26 1 prlimit64 0.00 0.000025 12 2 1 arch_prctl 0.00 0.000025 25 1 set_tid_address 0.00 0.000025 25 1 set_robust_list 0.00 0.000025 25 1 rseq 0.00 0.000016 4 4 close 0.00 0.000000 0 1 read 0.00 0.000000 0 8 mmap 0.00 0.000000 0 1 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve ------ ----------- ----------- --------- --------- ---------------- 100.00 6.416635 48 131108 2 total
可以看到调用的sendfile,次数与 read, write一致, 看下内核调用的情况
garlic@garlic:~/sendfile$ sudo trace-cmd record -p function ./sendfile a.txt f.txt garlic@garlic:~/sendfile$ sudo trace-cmd report|grep sendfile > sendfile.report
sendfile-23316 [001] 80486.269854: function: __x64_sys_sendfile64 sendfile-23316 [001] 80486.269855: function: do_sendfile sendfile-23316 [001] 80486.269855: function: __fdget sendfile-23316 [001] 80486.269855: function: __fget_light sendfile-23316 [001] 80486.269855: function: rw_verify_area sendfile-23316 [001] 80486.269856: function: security_file_permission sendfile-23316 [001] 80486.269856: function: apparmor_file_permission sendfile-23316 [001] 80486.269856: function: aa_file_perm sendfile-23316 [001] 80486.269856: function: __rcu_read_lock sendfile-23316 [001] 80486.269857: function: __rcu_read_unlockyyo
一次循环只需一个api调用就完成了, 数据不再拷贝到用户态缓冲区再写回, 减少了用户态和内核态之间的数据拷贝。更换参数再看一下
garlic@garlic:~/sendfile$ sudo trace-cmd record -p function_graph ./sendfile a1.txt b1.txt
sendfile-23168 [001] 80298.350255: funcgraph_entry: | __x64_sys_sendfile64() { sendfile-23168 [001] 80298.350255: funcgraph_entry: | do_sendfile() { sendfile-23168 [001] 80298.350256: funcgraph_entry: | __fdget() { sendfile-23168 [001] 80298.350256: funcgraph_entry: 0.554 us | __fget_light(); sendfile-23168 [001] 80298.350257: funcgraph_exit: 1.415 us | } sendfile-23168 [001] 80298.350258: funcgraph_entry: | rw_verify_area() { sendfile-23168 [001] 80298.350259: funcgraph_entry: | security_file_permission() { sendfile-23168 [001] 80298.350260: funcgraph_entry: | apparmor_file_permission() { sendfile-23168 [001] 80298.350260: funcgraph_entry: | aa_file_perm() { sendfile-23168 [001] 80298.350261: funcgraph_entry: | __rcu_read_lock() { sendfile-23168 [001] 80298.350261: funcgraph_exit: 0.418 us | } sendfile-23168 [001] 80298.350262: funcgraph_entry: 0.419 us | __rcu_read_unlock(); sendfile-23168 [001] 80298.350263: funcgraph_exit: 2.418 us | } sendfile-23168 [001] 80298.350263: funcgraph_exit: 3.491 us | } sendfile-23168 [001] 80298.350264: funcgraph_entry: | __fsnotify_parent() { sendfile-23168 [001] 80298.350264: funcgraph_exit: 0.490 us | } sendfile-23168 [001] 80298.350264: funcgraph_exit: 5.726 us | } sendfile-23168 [001] 80298.350265: funcgraph_exit: 6.864 us | } sendfile-23168 [001] 80298.350265: funcgraph_entry: | __fdget() { sendfile-23168 [001] 80298.350266: funcgraph_entry: | __fget_light() { sendfile-23168 [001] 80298.350266: funcgraph_exit: 0.436 us | } sendfile-23168 [001] 80298.350266: funcgraph_exit: 1.213 us | } sendfile-23168 [001] 80298.350267: funcgraph_entry: | get_pipe_info() { sendfile-23168 [001] 80298.350267: funcgraph_exit: 0.469 us | } sendfile-23168 [001] 80298.350268: funcgraph_entry: | rw_verify_area() { sendfile-23168 [001] 80298.350268: funcgraph_entry: | security_file_permission() { sendfile-23168 [001] 80298.350269: funcgraph_entry: | apparmor_file_permission() { sendfile-23168 [001] 80298.350270: funcgraph_entry: | aa_file_perm() { 
sendfile-23168 [001] 80298.350270: funcgraph_entry: | __rcu_read_lock() { sendfile-23168 [001] 80298.350270: funcgraph_exit: 0.389 us | } sendfile-23168 [001] 80298.350271: funcgraph_entry: | __rcu_read_unlock() { sendfile-23168 [001] 80298.350271: funcgraph_exit: 0.382 us | } sendfile-23168 [001] 80298.350271: funcgraph_exit: 1.876 us | } sendfile-23168 [001] 80298.350272: funcgraph_exit: 2.888 us | } sendfile-23168 [001] 80298.350272: funcgraph_exit: 3.814 us | } sendfile-23168 [001] 80298.350273: funcgraph_exit: 4.747 us | } sendfile-23168 [001] 80298.350273: funcgraph_entry: | __cond_resched() { sendfile-23168 [001] 80298.350274: funcgraph_exit: 0.421 us | } sendfile-23168 [001] 80298.350274: funcgraph_entry: | do_splice_direct() { sendfile-23168 [001] 80298.350275: funcgraph_entry: | rw_verify_area() { sendfile-23168 [001] 80298.350275: funcgraph_entry: | security_file_permission() { sendfile-23168 [001] 80298.350276: funcgraph_entry: | apparmor_file_permission() { sendfile-23168 [001] 80298.350276: funcgraph_entry: | aa_file_perm() { sendfile-23168 [001] 80298.350277: funcgraph_entry: 0.399 us | __rcu_read_lock(); sendfile-23168 [001] 80298.350278: funcgraph_entry: | __rcu_read_unlock() { sendfile-23168 [001] 80298.350278: funcgraph_exit: 0.427 us | } sendfile-23168 [001] 80298.350278: funcgraph_exit: 2.002 us | } sendfile-23168 [001] 80298.350279: funcgraph_exit: 2.787 us | } sendfile-23168 [001] 80298.350280: funcgraph_exit: 4.558 us | } sendfile-23168 [001] 80298.350280: funcgraph_entry: | splice_direct_to_actor() { sendfile-23168 [001] 80298.350281: funcgraph_entry: | alloc_pipe_info() { sendfile-23168 [001] 80298.350282: funcgraph_entry: | kmalloc_trace() { sendfile-23168 [001] 80298.350282: funcgraph_entry: | __kmem_cache_alloc_node() { sendfile-23168 [001] 80298.350283: funcgraph_entry: | __cond_resched() { sendfile-23168 [001] 80298.350283: funcgraph_exit: 0.562 us | } sendfile-23168 [001] 80298.350284: funcgraph_entry: | should_failslab() { 
sendfile-23168 [001] 80298.350284: funcgraph_exit: 0.434 us | } sendfile-23168 [001] 80298.350285: funcgraph_entry: | __rcu_read_lock() { sendfile-23168 [001] 80298.350285: funcgraph_exit: 0.427 us | } sendfile-23168 [001] 80298.350286: funcgraph_entry: | __get_obj_cgroup_from_memcg() { sendfile-23168 [001] 80298.350286: funcgraph_entry: 0.414 us | __rcu_read_lock(); sendfile-23168 [001] 80298.350287: funcgraph_entry: | __rcu_read_unlock() { sendfile-23168 [001] 80298.350287: funcgraph_exit: 0.442 us | } sendfile-23168 [001] 80298.350288: funcgraph_exit: 2.228 us | } sendfile-23168 [001] 80298.350288: funcgraph_entry: | __rcu_read_unlock() { sendfile-23168 [001] 80298.350289: funcgraph_exit: 0.457 us | } sendfile-23168 [001] 80298.350289: funcgraph_entry: | obj_cgroup_charge() { sendfile-23168 [001] 80298.350290: funcgraph_entry: | consume_obj_stock() { sendfile-23168 [001] 80298.350290: funcgraph_exit: 0.448 us | } sendfile-23168 [001] 80298.350290: funcgraph_exit: 1.320 us | } sendfile-23168 [001] 80298.350292: funcgraph_entry: 0.448 us | __rcu_read_lock(); sendfile-23168 [001] 80298.350293: funcgraph_entry: | __rcu_read_unlock() { sendfile-23168 [001] 80298.350293: funcgraph_exit: 0.412 us | } sendfile-23168 [001] 80298.350294: funcgraph_entry: | mod_objcg_state() { sendfile-23168 [001] 80298.350294: funcgraph_exit: 0.468 us | } sendfile-23168 [001] 80298.350295: funcgraph_entry: | __rcu_read_lock() { sendfile-23168 [001] 80298.350295: funcgraph_exit: 0.422 us | } sendfile-23168 [001] 80298.350296: funcgraph_entry: | __rcu_read_unlock() { sendfile-23168 [001] 80298.350296: funcgraph_exit: 0.441 us | } sendfile-23168 [001] 80298.350296: funcgraph_exit: + 14.291 us | } sendfile-23168 [001] 80298.350297: funcgraph_exit: + 15.238 us | } sendfile-23168 [001] 80298.350298: funcgraph_entry: | __kmalloc() { sendfile-23168 [001] 80298.350298: funcgraph_entry: | kmalloc_slab() { sendfile-23168 [001] 80298.350299: funcgraph_exit: 0.604 us | } sendfile-23168 [001] 
80298.350299: funcgraph_entry: | __kmem_cache_alloc_node() { sendfile-23168 [001] 80298.350300: funcgraph_entry: | __cond_resched() { sendfile-23168 [001] 80298.350300: funcgraph_exit: 0.506 us | } sendfile-23168 [001] 80298.350301: funcgraph_entry: | should_failslab() { sendfile-23168 [001] 80298.350301: funcgraph_exit: 0.454 us | } sendfile-23168 [001] 80298.350302: funcgraph_entry: | __rcu_read_lock() { sendfile-23168 [001] 80298.350302: funcgraph_exit: 0.502 us | } sendfile-23168 [001] 80298.350302: funcgraph_entry: | __get_obj_cgroup_from_memcg() { sendfile-23168 [001] 80298.350303: funcgraph_entry: | __rcu_read_lock() { sendfile-23168 [001] 80298.350303: funcgraph_exit: 0.435 us | }
可以看到有相关pipe操作, 通过管道复制完成。
splice_direct_to_actor – splices data directly between two non-pipes
https://manpages.debian.org/experimental/linux-manual-4.10/splice_direct_to_actor.9.en.html
写入调用direct_splice_actor->iter_file_splice_write
DMA
sendfile一般还会结合DMA技术来提升性能, sendfile减少了用户态和内核态切换,DMA控制器是负责DMA管理的外设,用于执行内存读取和写入操作而不占用CPU周期。当需要传输数据块时,处理器向DMA控制器提供源地址和目标地址以及总字节数。然后,DMA控制器会自动将数据从源传输到目标,而不会占用CPU周期。剩余字节数达到零时,块传输结束。 DMA 不仅可以减轻系统处理元件的负担,而且可以以比处理器读写更高的速率传输数据。
mmap+write
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>

/*
 * Copy a file with mmap(2) + memcpy(3).
 *
 * Both source and destination are mapped into the address space, so the
 * copy itself is a single memcpy and the kernel moves pages on demand --
 * no per-block read()/write() system calls.
 *
 * Usage: ./mmap <fromfile> <tofile>
 * Returns 0 on success, EXIT_FAILURE on any error.
 */
int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <fromfile> <tofile>\n", argv[0]);
        return EXIT_FAILURE;
    }
    const char *fromfile = argv[1];
    const char *tofile = argv[2];

    int fromfd = open(fromfile, O_RDONLY);
    if (fromfd < 0) {
        perror("open fromfile");
        return EXIT_FAILURE;
    }

    struct stat stat_buf;
    if (fstat(fromfd, &stat_buf) < 0) {
        perror("fstat");
        close(fromfd);
        return EXIT_FAILURE;
    }

    int tofd = open(tofile, O_RDWR | O_CREAT | O_TRUNC, stat_buf.st_mode);
    if (tofd < 0) {
        perror("open tofile");
        close(fromfd);
        return EXIT_FAILURE;
    }

    size_t n = (size_t)stat_buf.st_size;

    /* Empty source: mmap with length 0 would fail, and the original
     * lseek(tofd, n-1, ...) would underflow. The destination was already
     * created/truncated, so we are simply done. */
    if (n == 0) {
        close(fromfd);
        close(tofd);
        return 0;
    }

    /* Grow the destination to its final size in one call instead of the
     * lseek()+write("",1) trick; touching the mapping beyond EOF would
     * otherwise raise SIGBUS. */
    if (ftruncate(tofd, (off_t)n) < 0) {
        perror("ftruncate");
        close(fromfd);
        close(tofd);
        return EXIT_FAILURE;
    }

    void *src = mmap(NULL, n, PROT_READ, MAP_SHARED, fromfd, 0);
    if (src == MAP_FAILED) {
        /* Original code printed an error here but still fell through to
         * memcpy on MAP_FAILED -- a guaranteed crash. Bail out instead. */
        perror("mmap fromfile");
        close(fromfd);
        close(tofd);
        return EXIT_FAILURE;
    }

    void *dst = mmap(NULL, n, PROT_WRITE, MAP_SHARED, tofd, 0);
    if (dst == MAP_FAILED) {
        perror("mmap tofile");
        munmap(src, n);
        close(fromfd);
        close(tofd);
        return EXIT_FAILURE;
    }

    memcpy(dst, src, n);

    munmap(src, n);
    munmap(dst, n);
    close(fromfd);
    close(tofd);
    return 0;
}
同样用一个512M的文件验证一下。
garlic@garlic:~/sendfile$ fallocate -l 512M a1.txt garlic@garlic:~/sendfile$ sudo strace -c ./mmap a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 55.00 0.196837 65612 3 munmap 44.80 0.160305 40076 4 close 0.04 0.000142 142 1 write 0.04 0.000139 13 10 mmap 0.04 0.000135 45 3 mprotect 0.03 0.000111 27 4 openat 0.01 0.000039 13 3 newfstatat 0.01 0.000026 26 1 lseek 0.01 0.000026 13 2 1 arch_prctl 0.01 0.000026 26 1 prlimit64 0.01 0.000025 25 1 set_tid_address 0.01 0.000025 25 1 set_robust_list 0.01 0.000025 25 1 rseq 0.00 0.000000 0 1 read 0.00 0.000000 0 1 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve ------ ----------- ----------- --------- --------- ---------------- 100.00 0.357861 8728 41 2 totalzz
可以看到系统调用非常少, 不过mmap的映射长度以页为单位, 一般一页是4096字节(4KB)。
#include <sys/mman.h> void *mmap(void addr[.length], size_t length, int prot, int flags, int fd, off_t offset); int munmap(void addr[.length], size_t length);
- addr: 映射的地址
- length: 映射的长度
- prot:模式,可读,可写, 可执行, 不能访问
- flag:私有或共享
- fd: 文件句柄
- offset:从文件句柄什么地方开始映射
与sendfile类似, 涉及与外部IO通讯的部分可以通过DMA来进行加速。
关于copy的优化, 还有另外两个API:
splice()
sendfile 在我当前测试的内核版本中是通过管道实现的, 而splice则要求其中至少一端必须是管道。
写一个例子
#define _GNU_SOURCE /* splice(2) and SPLICE_F_* flags; see feature_test_macros(7) */
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Move one pipe buffer's worth of data per round: the default pipe
 * capacity is 64 KiB, so asking for more per splice() call is clamped
 * anyway, and 4 KiB chunks multiply the syscall count by 16. */
#define CHUNK ((size_t)65536)

/*
 * Copy a file with splice(2): file -> pipe -> file.
 *
 * splice() requires at least one side of each call to be a pipe, so the
 * copy is staged through an anonymous pipe; page references are moved
 * inside the kernel rather than copied through user space.
 *
 * Usage: ./splice <fromfile> <tofile>
 * Returns 0 on success, EXIT_FAILURE on any error.
 */
int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <fromfile> <tofile>\n", argv[0]);
        return EXIT_FAILURE;
    }
    const char *fromfile = argv[1];
    const char *tofile = argv[2];

    int fromfd = open(fromfile, O_RDONLY);
    if (fromfd < 0) {
        perror("open fromfile");
        return EXIT_FAILURE;
    }

    struct stat stat_buf;
    if (fstat(fromfd, &stat_buf) < 0) {
        perror("fstat");
        close(fromfd);
        return EXIT_FAILURE;
    }

    int tofd = open(tofile, O_RDWR | O_CREAT | O_TRUNC, stat_buf.st_mode);
    if (tofd < 0) {
        perror("open tofile");
        close(fromfd);
        return EXIT_FAILURE;
    }

    int p[2];
    if (pipe(p) < 0) {
        perror("pipe");
        close(fromfd);
        close(tofd);
        return EXIT_FAILURE;
    }

    /* n must be signed: splice() returns -1 on error, and with the
     * original `size_t n` that wrapped to a huge positive value, so the
     * `> 0` loop condition could never detect a failure. */
    ssize_t n;
    while ((n = splice(fromfd, NULL, p[1], NULL, CHUNK,
                       SPLICE_F_MOVE | SPLICE_F_MORE)) > 0) {
        /* Drain everything that entered the pipe; the draining splice()
         * may move fewer bytes than requested, and ignoring its return
         * value (as the original did) silently drops data. */
        ssize_t left = n;
        while (left > 0) {
            ssize_t moved = splice(p[0], NULL, tofd, NULL, (size_t)left,
                                   SPLICE_F_MOVE | SPLICE_F_MORE);
            if (moved < 0) {
                perror("splice to file");
                close(p[0]);
                close(p[1]);
                close(fromfd);
                close(tofd);
                return EXIT_FAILURE;
            }
            left -= moved;
        }
    }
    if (n < 0) {
        perror("splice from file");
        close(p[0]);
        close(p[1]);
        close(fromfd);
        close(tofd);
        return EXIT_FAILURE;
    }

    close(p[0]);
    close(p[1]);
    close(fromfd);
    close(tofd);
    return 0;
}
garlic@garlic:~/sendfile$ sudo strace -c ./splice a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 99.96 135.609931 517 262145 splice 0.04 0.051841 12960 4 close 0.00 0.000150 37 4 openat 0.00 0.000132 44 3 mprotect 0.00 0.000050 50 1 munmap 0.00 0.000042 42 1 pipe2 0.00 0.000039 13 3 newfstatat 0.00 0.000026 26 1 prlimit64 0.00 0.000000 0 1 read 0.00 0.000000 0 8 mmap 0.00 0.000000 0 1 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve 0.00 0.000000 0 2 1 arch_prctl 0.00 0.000000 0 1 set_tid_address 0.00 0.000000 0 1 set_robust_list 0.00 0.000000 0 1 rseq ------ ----------- ----------- --------- --------- ---------------- 100.00 135.662211 517 262181 2 total
这个耗时有点出乎意料, 非常慢。
将strace -c 去掉
splice(5, NULL, 4, NULL, 4096, SPLICE_F_MOVE|SPLICE_F_MORE) = 4096 splice(3, NULL, 6, NULL, 4096, SPLICE_F_MOVE|SPLICE_F_MORE) = 4096 splice(5, NULL, 4, NULL, 4096, SPLICE_F_MOVE|SPLICE_F_MORE) = 4096
直接将4096改为 stat_buf.st_size (文件大小), 可以看到pipe的最大buffer为65536字节(64KB), 单次splice最多只能搬运这么多。
splice(5, NULL, 4, NULL, 65536, SPLICE_F_MOVE|SPLICE_F_MORE) = 65536 splice(3, NULL, 6, NULL, 536870912, SPLICE_F_MOVE|SPLICE_F_MORE) = 65536 splice(5, NULL, 4, NULL, 65536, SPLICE_F_MOVE|SPLICE_F_MORE) = 65536 splice(3, NULL, 6, NULL, 536870912, SPLICE_F_MOVE|SPLICE_F_MORE) = 65536 splice(5, NULL, 4, NULL, 65536, SPLICE_F_MOVE|SPLICE_F_MORE) = 65536
garlic@garlic:~/sendfile$ sudo strace -c ./splice a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 82.70 1.441307 87 16385 splice 16.11 0.280795 46799 6 close 1.14 0.019827 4956 4 openat 0.02 0.000312 39 8 mmap 0.01 0.000130 43 3 mprotect 0.01 0.000108 108 1 1 access 0.01 0.000099 33 3 newfstatat 0.00 0.000056 28 2 pread64 0.00 0.000055 55 1 pipe2 0.00 0.000052 52 1 munmap 0.00 0.000033 33 1 rseq 0.00 0.000030 30 1 read 0.00 0.000026 26 1 prlimit64 0.00 0.000025 12 2 1 arch_prctl 0.00 0.000025 25 1 set_tid_address 0.00 0.000025 25 1 set_robust_list 0.00 0.000000 0 1 brk 0.00 0.000000 0 1 execve ------ ----------- ----------- --------- --------- ---------------- 100.00 1.742905 106 16423 2 totals
splice设计是为了减少复制,允许数据在不进行复制的情况下实现不同文件描述符间的共享。类似mmap。
splice主要是还有网络应用的场景,类似sendfile,
https://lwn.net/Articles/923237/ 这篇文章讨论了splice功能的一些问题, 主要是splice操作进行期间源文件被写入的问题: 和mmap类似, 当通过splice把文件通过网络发送给客户端时, 文件不应被修改, 否则会出现内容错乱的问题。
首先,不可能直接将文件splice(拼接)到网络套接字中;splice()要求至少其中一个提供给它的文件描述符是一个管道(pipe)。因此,实际的操作顺序是将文件splice到管道中,然后使用第二个splice()调用将管道连接到套接字。两个splice()调用都不知道它们传递的数据何时到达最终目的地;即使两个splice()调用都已完成,网络层可能仍在处理文件数据。没有简单的方法来确定数据已传输并且可以安全地修改文件。
copy_file_range()
linux 4.5 版本已实现在同样文件系统下使用, 5.3-5.18中可以支持跨文件系统, 但使用时需要注意可能存在文件系统不支持, 拷贝失败但是返回成功
写一个copy_file_range的例子
garlic@garlic:~/sendfile$ sudo strace -c ./copyfilerange a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 99.97 1.931937 1931937 1 copy_file_range 0.01 0.000262 65 4 close 0.01 0.000125 41 3 mprotect 0.00 0.000076 19 4 openat 0.00 0.000051 51 1 munmap 0.00 0.000051 17 3 newfstatat 0.00 0.000028 28 1 prlimit64 0.00 0.000000 0 1 read 0.00 0.000000 0 8 mmap 0.00 0.000000 0 1 brk 0.00 0.000000 0 2 pread64 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve 0.00 0.000000 0 2 1 arch_prctl 0.00 0.000000 0 1 set_tid_address 0.00 0.000000 0 1 set_robust_list 0.00 0.000000 0 1 rseq ------ ----------- ----------- --------- --------- ---------------- 100.00 1.932530 53681 36 2 totalk可以
可以看到这个实现一个api调用就搞定。
关于copy的性能对比这篇文章有更详细的性能测试
https://alexsaveau.dev/blog/performance/files/kernel/the-fastest-way-to-copy-a-file
可以看到大文件情况下,mmap的性能是优于copy_file_range的,但是copy_file_range更加简洁。方便使用。
关于高效拷贝stackflow有个帖子
https://stackoverflow.com/questions/7463689/most-efficient-way-to-copy-a-file-in-linux,
提到使用read/write方法时, 如果将缓冲区设置为L1 cache大小, 可以做到接近zero-copy的性能。
lscpu Caches (sum of all): L1d: 64 KiB (2 instances) L1i: 64 KiB (2 instances) L2: 512 KiB (2 instances) L3: 12 MiB (2 instances)
L1d 存放数据, L1i存放指令
把缓存调整到64K
garlic@garlic:~/sendfile$ sudo strace -c ./read a1.txt b1.txt % time seconds usecs/call calls errors syscall ------ ----------- ----------- --------- --------- ---------------- 61.52 0.405123 98 4096 write 38.37 0.252724 61 4098 read 0.03 0.000229 28 8 mmap 0.01 0.000090 30 3 newfstatat 0.01 0.000087 29 3 mprotect 0.01 0.000083 20 4 openat 0.01 0.000059 14 4 close 0.01 0.000035 35 1 munmap 0.01 0.000034 17 2 pread64 0.00 0.000028 28 1 rseq 0.00 0.000019 9 2 1 arch_prctl 0.00 0.000019 19 1 set_robust_list 0.00 0.000019 19 1 prlimit64 0.00 0.000018 18 1 set_tid_address 0.00 0.000000 0 1 brk 0.00 0.000000 0 1 1 access 0.00 0.000000 0 1 execve ------ ----------- ----------- --------- --------- ---------------- 100.00 0.658567 80 8228 2 total
另外可以通过posix_fadvise
和 posix_fallocate
对文件进行预读和空间分配。
Zero-Copy
“Zero-copy” describes computer operations in which the CPU does not perform the task of copying data from one memory area to another or in which unnecessary data copies are avoided. This is frequently used to save CPU cycles and memory bandwidth in many time consuming tasks, such as when transmitting a file at high speed over a network, etc., thus improving the performance of programs (processes) executed by a computer.[1][2][3][4]
“Zero-copy”描述了 CPU 不执行将数据从一个存储区域复制到另一存储区域的任务或避免不必要的数据复制的计算机操作。 这经常用于在许多耗时的任务中节省 CPU 周期和内存带宽,例如通过网络高速传输文件时等,从而提高计算机执行的程序(进程)的性能。
上面例子仅是api调用的调整, 在外设和内存间优化使用DMA技术, 读取数据直接由DMA完成,而不用频繁中断CPU去处理。
参考及引用
zero-copy
https://blog.devgenius.io/linux-zero-copy-d61d712813fe
https://www.hitzhangjie.pro/blog/2021-09-09-%E5%B8%B8%E8%A7%81%E7%9A%84%E9%9B%B6%E6%8B%B7%E8%B4%9D%E4%BC%98%E5%8C%96%E6%8A%80%E6%9C%AF/
https://alexsaveau.dev/blog/performance/files/kernel/the-fastest-way-to-copy-a-file 这篇文章使用rust编写了例子, 并进行了性能测试
Demonstration of splice, tee and AF_ALG hashing for zero-copy storage and data hash calculation
hxxps://gist.github.com/NicolasT/f2a0e537bc0c2329ae985ebd39bf58bb
https://blog.superpat.com/zero-copy-in-linux-with-sendfile-and-splice
glibc 系统调用分析:
http://terenceli.github.io/%E6%8A%80%E6%9C%AF/2019/02/17/glibc-syscall-wrapper
https://stackoverflow.com/questions/6655608/understanding-c-built-in-library-function-implementations
glibc IO
https://tttang.com/archive/1279/
trace-cmd
https://opensource.com/article/21/7/linux-kernel-trace-cmd
DMA
点击以访问 Pinchart–mastering_the_dma_and_iommu_apis.pdf
图片from洪鐘富
Comments are closed.