[OS] write() 시스템 콜 분석
Linux kernel source tree의 GitHub 코드를 참조해 시스템 콜 호출 시의 처리 과정을 분석한 글입니다.
📌 write()
- open() 시스템 콜로 획득한 file descriptor를 통해 데이터를 기록하는 데 사용되는 함수
- 파일에 데이터를 쓰거나, 표준 출력으로 데이터 출력 시 사용
🫧 과정
.png)
- 오프셋 검사
- 권한 검사 및 영역 유효성 검사
- 쓰기 방식에 따라 쓰기 작업 수행
🫧 특징
- ext4_buffered_write_iter()는 IOCB_NOWAIT(논블로킹) 쓰기를 지원하지 않음 (-EOPNOTSUPP 반환)
🫧 사용 예시 (코드)
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
 * Demo program: open (creating if necessary) the file "TEXT" and write a
 * greeting into it with the write() system call.
 *
 * Exits with EXIT_FAILURE when open(), write() or close() fails, so the
 * caller can tell failure from success; returns 0 otherwise.
 */
int main(void) {
    int fd = open("TEXT", O_RDWR | O_CREAT, 0666);
    if (fd == -1) {
        fprintf(stderr, "Failed to open file.\n");
        exit(EXIT_FAILURE);
    }

    const char *buf = "Welcome to VALLHALLA!\n";

    // writes buf --> fd  (this is the write() call the article analyses)
    ssize_t nr = write(fd, buf, strlen(buf));
    if (nr == -1) {
        fprintf(stderr, "Failed to write file.\n");
        close(fd);  /* do not leak the descriptor on the error path */
        exit(EXIT_FAILURE);
    }
    printf("Writing Success!\n");

    /* Release the descriptor; a failing close() is reported as an error. */
    if (close(fd) == -1) {
        fprintf(stderr, "Failed to close file.\n");
        exit(EXIT_FAILURE);
    }
    return 0;
}
🫧 코드
✨ SYSCALL_DEFINE3
- fs/read_write.c, $739
/*
 * write() system call entry point.
 *
 * Passes its three arguments (fd, buf, count) unchanged to ksys_write() and
 * returns that function's result.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);	/* next hop in the call chain */
}
✨ ksys_write()
- 오프셋 확인 후 vfs_write() 함수 호출
- fs/read_write.c, $720
/*
 * ksys_write - look up @fd and write @count bytes from @buf via vfs_write().
 * @fd:    file descriptor number supplied by the caller
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 *
 * Returns vfs_write()'s result, or -EBADF when the descriptor lookup comes
 * back empty.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		/* 1. Fetch the file's current offset (file_ppos() may yield NULL). */
		loff_t pos, *ppos = file_ppos(fd_file(f));

		if (ppos) {
			/* Work on a local copy; f_pos is written back only on success. */
			pos = *ppos;
			ppos = &pos;
		}

		/* 2. Delegate the actual write to vfs_write() (next hop). */
		ret = vfs_write(fd_file(f), buf, count, ppos);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	return ret;
}
✨ vfs_write()
- 권한 검사 및 ext4 쓰기 함수 호출
- fs/read_write.c, $659
/*
 * vfs_write - validate a write request, then dispatch to the file's write
 * method.
 * @file:  open file to write to
 * @buf:   user-space source buffer
 * @count: number of bytes requested (clamped to MAX_RW_COUNT)
 * @pos:   file offset pointer handed on to the write method
 *
 * Returns the write method's result, or a negative errno from one of the
 * checks: -EBADF (not opened for writing), -EINVAL (file cannot be written
 * or has neither ->write nor ->write_iter), -EFAULT (user buffer not
 * accessible), or whatever rw_verify_area() reports.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* 1. Permission checks on the file mode. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/* Make sure the user-space buffer range may be accessed. */
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	/* Verify that the target area (offset/length) is valid for a write. */
	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/*
	 * 2. Perform the write, bracketed by file_start_write()/file_end_write().
	 * ->write is preferred; otherwise ->write_iter is driven through
	 * new_sync_write().
	 * NOTE(review): the article goes on to ext4_file_write_iter(), which
	 * would be reached through the write_iter branch; confirm against
	 * ext4's file_operations.
	 */
	file_start_write(file);
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		/* Only a write that transferred bytes triggers notification/accounting. */
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	file_end_write(file);
	return ret;
}
✨ ext4_file_write_iter()
- 쓰기 방식에 따른 각기 다른 함수 호출 (DAX, DIO, Buffer)
- /fs/ext4/file.c, $688
/*
 * ext4_file_write_iter - pick the ext4 write path for this request.
 * @iocb: I/O control block (its ki_flags select the path)
 * @from: iterator describing the data to write
 *
 * Order of decisions:
 *   - forced shutdown of the filesystem -> -EIO
 *   - DAX inode (CONFIG_FS_DAX only)    -> ext4_dax_write_iter()
 *   - IOCB_ATOMIC: length must lie within [s_awu_min, s_awu_max] and pass
 *     generic_atomic_write_valid(), otherwise the error is returned
 *   - IOCB_DIRECT                       -> ext4_dio_write_iter()
 *   - anything else                     -> ext4_buffered_write_iter()
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return -EIO;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif

	if (iocb->ki_flags & IOCB_ATOMIC) {
		size_t len = iov_iter_count(from);
		int ret;

		/* Reject lengths outside the superblock's atomic-write unit bounds. */
		if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
		    len > EXT4_SB(inode->i_sb)->s_awu_max)
			return -EINVAL;

		ret = generic_atomic_write_valid(iocb, from);
		if (ret)
			return ret;
	}

	/* The two paths the article follows next: direct I/O vs. buffered I/O. */
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);
	else
		return ext4_buffered_write_iter(iocb, from);
}
- DAX : Direct Access (사용자 버퍼 → 장치 메모리)
- DIO : Direct I/O (사용자 버퍼 → 블록 디바이스)
- Buffered I/O : 일반 파일 쓰기 (사용자 버퍼 → 페이지 캐시 → 추후에 디스크 쓰기 진행)
✨ ext4_dio_write_iter()
- Direct I/O 작업에서의 write() 시스템 콜 처리
- /fs/ext4/file.c, $499
/*
 * Direct I/O write path for ext4.
 *
 * Takes the inode lock (shared when the write does not appear to extend the
 * file, exclusive otherwise), falls back to ext4_buffered_write_iter() when
 * ext4_should_use_dio() rejects the request, otherwise issues the write via
 * iomap_dio_rw(). Any bytes still left in @from afterwards are completed
 * through the buffered path and then written back and invalidated.
 *
 * Returns the number of bytes written or a negative errno; -EAGAIN when
 * IOCB_NOWAIT is set and the inode lock could not be taken immediately.
 */
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret;
handle_t *handle;
struct inode *inode = file_inode(iocb->ki_filp);
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
bool extend = false, unwritten = false;
bool ilock_shared = true;
int dio_flags = 0;
/*
* Quick check here without any i_rwsem lock to see if it is extending
* IO. A more reliable check is done in ext4_dio_write_checks() with
* proper locking in place.
*/
if (offset + count > i_size_read(inode))
ilock_shared = false;
/* IOCB_NOWAIT: only try-lock and bail out with -EAGAIN instead of sleeping. */
if (iocb->ki_flags & IOCB_NOWAIT) {
if (ilock_shared) {
if (!inode_trylock_shared(inode))
return -EAGAIN;
} else {
if (!inode_trylock(inode))
return -EAGAIN;
}
} else {
if (ilock_shared)
inode_lock_shared(inode);
else
inode_lock(inode);
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
inode_unlock(inode);
return ext4_buffered_write_iter(iocb, from);
}
/*
* Prevent inline data from being created since we are going to allocate
* blocks for DIO. We know the inode does not currently have inline data
* because ext4_should_use_dio() checked for it, but we have to clear
* the state flag before the write checks because a lock cycle could
* introduce races with other writers.
*/
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
/* The checks may update ilock_shared, extend, unwritten and dio_flags. */
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
&unwritten, &dio_flags);
/*
* NOTE(review): this returns without unlocking the inode here; presumably
* ext4_dio_write_checks() drops the lock itself when it returns <= 0.
* Confirm against that function.
*/
if (ret <= 0)
return ret;
/* Re-read position and length: a positive ret is used as the byte count. */
offset = iocb->ki_pos;
count = ret;
/* Extending write: put the inode on the orphan list before doing the I/O. */
if (extend) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ret = ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
if (ret)
goto out;
}
/* Shared lock and no unwritten extents: use the overwrite iomap ops. */
if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
dio_flags, NULL, 0);
/* Map -ENOTBLK to 0 so the buffered fallback below handles the remaining bytes. */
if (ret == -ENOTBLK)
ret = 0;
if (extend) {
/*
* We always perform extending DIO write synchronously so by
* now the IO is completed and ext4_handle_inode_extension()
* was called. Cleanup the inode in case of error or race with
* writeback of delalloc blocks.
*/
WARN_ON_ONCE(ret == -EIOCBQUEUED);
ext4_inode_extension_cleanup(inode, ret < 0);
}
out:
/* Drop whichever flavour of the inode lock is currently held. */
if (ilock_shared)
inode_unlock_shared(inode);
else
inode_unlock(inode);
/* Data still left in the iterator: finish the request via buffered I/O. */
if (ret >= 0 && iov_iter_count(from)) {
ssize_t err;
loff_t endbyte;
/*
* There is no support for atomic writes on buffered-io yet,
* we should never fallback to buffered-io for DIO atomic
* writes.
*/
WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
return err;
/*
* We need to ensure that the pages within the page cache for
* the range covered by this I/O are written to disk and
* invalidated. This is in attempt to preserve the expected
* direct I/O semantics in the case we fallback to buffered I/O
* to complete off the I/O request.
*/
ret += err;
endbyte = offset + err - 1;
err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
offset, endbyte);
if (!err)
invalidate_mapping_pages(iocb->ki_filp->f_mapping,
offset >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
}
return ret;
}
✨ ext4_buffered_write_iter()
- fs/ext4/file.c, $285
/*
 * ext4_buffered_write_iter - buffered write path for ext4.
 * @iocb: I/O control block for the request
 * @from: iterator describing the data to write
 *
 * IOCB_NOWAIT requests are rejected with -EOPNOTSUPP. Otherwise the write
 * checks and the write itself run under the inode lock. A result <= 0 is
 * returned as is; a positive byte count is passed to generic_write_sync(),
 * whose result is returned.
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	/* 1. Take the inode lock. */
	inode_lock(inode);
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	/* 2. Perform the write. */
	ret = generic_perform_write(iocb, from);

out:
	inode_unlock(inode);
	if (unlikely(ret <= 0))
		return ret;
	/* Next hop: sync the written range if the request asks for it. */
	return generic_write_sync(iocb, ret);
}
✨ generic_write_sync()
- 쓰기 후 동기화 함수 (동기식 쓰기 요청일 경우 기록된 범위를 vfs_fsync_range()로 flush)
- include/linux/fs.h, $2901
/*
 * Flush the freshly written byte range when the request carries synchronous
 * write semantics. ki_pos is expected to have been advanced past the written
 * data already. Yields @count unchanged on success (or when no sync is
 * needed), otherwise the error reported by vfs_fsync_range().
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
	int datasync;
	int err;

	/* Nothing to flush unless the iocb requests (d)sync behaviour. */
	if (!iocb_is_dsync(iocb))
		return count;

	/* IOCB_SYNC asks for a full sync (datasync = 0); otherwise data-only. */
	datasync = !(iocb->ki_flags & IOCB_SYNC);

	err = vfs_fsync_range(iocb->ki_filp,
			      iocb->ki_pos - count, iocb->ki_pos - 1,
			      datasync);
	if (err)
		return err;

	return count;
}
✨ ext4_write_begin()
- /fs/ext4/inode.c, $1140
/*
 * ext4_write_begin - prepare a folio for a buffered write.
 * @file:    not referenced in this body
 * @mapping: address space of the file being written (mapping->host is the inode)
 * @pos:     byte offset of the write
 * @len:     number of bytes to be written
 * @foliop:  out parameter; receives the prepared folio on success
 * @fsdata:  not referenced in this body
 *
 * Steps: try the inline-data path if the inode may hold inline data, grab
 * the folio, start a journal handle, map/allocate blocks with
 * ext4_block_write_begin(), and for data-journalled inodes get journal write
 * access to the buffers. On failure, blocks instantiated beyond i_size are
 * trimmed again and -ENOSPC may be retried.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len,
			    struct folio **foliop, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;
	handle_t *handle;
	int retries = 0;
	struct folio *folio;
	pgoff_t index;
	unsigned from, to;

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return -EIO;

	trace_ext4_write_begin(inode, pos, len);
	/*
	 * Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason
	 */
	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
	index = pos >> PAGE_SHIFT;
	from = pos & (PAGE_SIZE - 1);
	to = from + len;

	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
						    foliop);
		if (ret < 0)
			return ret;
		/* A return of 1 is treated as "handled by the inline path". */
		if (ret == 1)
			return 0;
	}

	/*
	 * __filemap_get_folio() can take a long time if the
	 * system is thrashing due to memory pressure, or if the folio
	 * is being written back.  So grab it first before we start
	 * the transaction handle.  This also allows us to allocate
	 * the folio (if needed) without using GFP_NOFS.
	 */
retry_grab:
	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);
	/*
	 * The same as page allocation, we prealloc buffer heads before
	 * starting the handle.
	 */
	if (!folio_buffers(folio))
		create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);

	folio_unlock(folio);

retry_journal:
	/* 1. Start the journal handle. */
	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
	if (IS_ERR(handle)) {
		folio_put(folio);
		return PTR_ERR(handle);
	}

	folio_lock(folio);
	if (folio->mapping != mapping) {
		/* The folio got truncated from under us */
		folio_unlock(folio);
		folio_put(folio);
		ext4_journal_stop(handle);
		goto retry_grab;
	}
	/* In case writeback began while the folio was unlocked */
	folio_wait_stable(folio);

	/* 2. Allocate/map the data blocks needed for the write. */
	if (ext4_should_dioread_nolock(inode))
		ret = ext4_block_write_begin(handle, folio, pos, len,
					     ext4_get_block_unwritten);
	else
		ret = ext4_block_write_begin(handle, folio, pos, len,
					     ext4_get_block);
	if (!ret && ext4_should_journal_data(inode)) {
		/* 3. Register the buffers with the journal (data-journalled inodes only). */
		ret = ext4_walk_page_buffers(handle, inode,
					     folio_buffers(folio), from, to,
					     NULL, do_journal_get_write_access);
	}

	if (ret) {
		bool extended = (pos + len > inode->i_size) &&
				!ext4_verity_in_progress(inode);

		folio_unlock(folio);
		/*
		 * ext4_block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_rwsem.
		 *
		 * Add inode to orphan list in case we crash before
		 * truncate finishes
		 */
		if (extended && ext4_can_truncate(inode))
			ext4_orphan_add(handle, inode);

		ext4_journal_stop(handle);
		if (extended) {
			ext4_truncate_failed_write(inode);
			/*
			 * If truncate failed early the inode might
			 * still be on the orphan list; we need to
			 * make sure the inode is removed from the
			 * orphan list in that case.
			 */
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);
		}

		/* Out of space: retry from the journal start if allocation may succeed now. */
		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;
		folio_put(folio);
		return ret;
	}
	*foliop = folio;
	return ret;
}