[OS] unlink() 시스템 콜 분석
Linux kernel source tree의 깃허브 코드를 참조해 시스템 콜 호출 시 변화 과정을 분석한 글입니다.
📌 unlink()
- 파일 시스템에서 이름과 연결된 파일 (inode)에 대한 포인터를 제거하는 작업
- 디렉터리 엔트리 삭제
- 디렉터리는 rmdir() 시스템 콜로 삭제(비어 있어야 함)하고, unlink()는 디렉터리가 아닌 파일(일반 파일, 심볼릭 링크 등)의 디렉터리 엔트리만 삭제
🫧 과정
(그림: unlink() 시스템 콜 처리 과정)
- 경로 해석
- 권한 확인
- 파일 삭제
- inode 후처리
🫧 특징
- 중간에 lookup() 함수로 탐색을 하며 dentry → inode를 연결함
- 만약 dentry가 이미 존재하면 캐시된 dentry를 재활용함 (inode가 같을 수도, 다를 수도)
🫧 사용 예시 (코드)
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
/*
 * Remove every path given on the command line with unlink(2).
 *
 * With no path argument, prints a usage line to stderr and returns 1.
 * A failed unlink() is reported on stderr together with the strerror()
 * text for errno, and processing continues with the next argument; the
 * program still returns 0 in that case.
 */
int main(int argc, char *argv[])
{
    int idx;

    if(argc == 1) {
        fprintf(stderr, "%s {file1} [file2] ...\n", argv[0]);
        return 1;
    }

    for(idx = 1; idx < argc; idx++) {
        if(unlink(argv[idx]) == -1) {
            /* strerror() (declared in <string.h>) maps errno to a message. */
            fprintf(stderr, "%s file 삭제 error: %s\n", argv[idx], strerror(errno));
        } else {
            printf("%s file이 삭제되었습니다.\n", argv[idx]);
        }
    }

    return 0;
}
🫧 코드
✨ SYSCALL_DEFINE1
- /fs/namei.c, $4644
/*
 * unlink(2) entry point.
 *
 * Hands the user-supplied pathname, wrapped by getname(), to do_unlinkat()
 * with AT_FDCWD as the directory file descriptor and returns its result.
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
	return do_unlinkat(AT_FDCWD, getname(pathname));
}
✨ do_unlinkat()
- /fs/namei.c, $4562
/*
* Make sure that the actual truncation of the file will occur outside its
* directory's i_mutex. Truncate can take a long time if there is a lot of
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
int do_unlinkat(int dfd, struct filename *name)
{
int error;
struct dentry *dentry;
struct path path;
struct qstr last;
int type;
struct inode *inode = NULL;
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
**// 1. 파일명을 부모 디렉터리와 파일명으로 분리**
error = **filename_parentat(dfd, name, lookup_flags, &path, &last, &type)**;
if (error)
goto exit1;
error = -EISDIR;
if (type != LAST_NORM)
goto exit2;
**// 2. 쓰기 권한 획득**
error = **mnt_want_write(path.mnt)**;
if (error)
goto exit2;
retry_deleg:
**// 3. 디렉터리 락 및 삭제할 dentry 탐색**
// 특정 inode에 대한 쓰기 락 획득
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
// 삭제할 dentry 탐색
dentry = **lookup_one_qstr_excl(&last, path.dentry, lookup_flags)**;
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
if (last.name[last.len] || d_is_negative(dentry))
goto slashes;
inode = dentry->d_inode;
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
goto exit3;
**// 4. 실제 unlink 수행**
error = **vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, &delegated_inode)**;
exit3:
dput(dentry);
}
inode_unlock(path.dentry->d_inode);
// i_nlink = 0이 되며 trancate 발생
// 필요 시 truncate 수행
if (inode)
iput(inode); /* truncate the inode here */
inode = NULL;
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
mnt_drop_write(path.mnt);
exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
inode = NULL;
goto retry;
}
exit1:
putname(name);
return error;
slashes:
if (d_is_negative(dentry))
error = -ENOENT;
else if (d_is_dir(dentry))
error = -EISDIR;
else
error = -ENOTDIR;
goto exit3;
}
✨ filename_parentat()
- /fs/namei.c, $2779
/*
 * Thin wrapper around __filename_parentat() that passes NULL as the root
 * argument; every other argument and the return value are forwarded
 * unchanged.
 */
static int filename_parentat(int dfd, struct filename *name,
			     unsigned int flags, struct path *parent,
			     struct qstr *last, int *type)
{
	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}
✨ __filename_parentat()
- 전체 경로를 부모 디렉터리 + 파일명으로 분리
- /fs/namei.c, $2703
/*
 * Note: this does not consume "name"
 *
 * Walks @name relative to @dfd (and @root) via path_parentat() and, on
 * success, stores the parent in @parent, the final component in @last and
 * its type in @type.
 *
 * Returns PTR_ERR(name) if @name is an error pointer, otherwise the result
 * of the last path_parentat() attempt (0 on success).
 */
static int __filename_parentat(int dfd, struct filename *name,
			       unsigned int flags, struct path *parent,
			       struct qstr *last, int *type,
			       const struct path *root)
{
	int retval;
	struct nameidata nd;

	if (IS_ERR(name))
		return PTR_ERR(name);
	// 1. Set up the nameidata structure
	set_nameidata(&nd, dfd, name, root);
	// 2. Walk to the parent path: first with LOOKUP_RCU, again without it
	//    on -ECHILD, and once more with LOOKUP_REVAL on -ESTALE
	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
	if (unlikely(retval == -ECHILD))
		retval = path_parentat(&nd, flags, parent);
	if (unlikely(retval == -ESTALE))
		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
	if (likely(!retval)) {
		*last = nd.last;
		*type = nd.last_type;
		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
	}
	restore_nameidata();
	return retval;
}
✨ path_parentat()
- 경로 탐색 후 부모 path 반환
- /fs/namei.c, $2687
/*
 * Returns 0 and nd will be valid on success; Returns error, otherwise.
 *
 * On success the walked-to path is moved out of @nd into @parent
 * (nd->path is cleared). terminate_walk() runs in every case.
 */
static int path_parentat(struct nameidata *nd, unsigned flags,
				struct path *parent)
{
	// 1. Initialise the walk and traverse the path
	//    (path_init() and link_path_walk() are covered in the author's open() write-up)
	const char *s = path_init(nd, flags);
	int err = link_path_walk(s, nd);

	// 2. Post-walk processing (per the author: leaving RCU mode, root-scope check, ...)
	if (!err)
		err = complete_walk(nd);
	if (!err) {
		// 3. Hand the parent path to the caller.
		// unlink has to remove the file's entry from its parent directory,
		// so the parent path is what the caller needs.
		*parent = nd->path;
		nd->path.mnt = NULL;
		nd->path.dentry = NULL;
	}
	terminate_walk(nd);
	return err;
}
✨ complete_walk()
- 경로 탐색 후 마지막 처리
- /fs/namei.c, $942
- RCU 잠금 해제, 루트 범위 확인 등 경로 탐색 후 처리
/*
 * Final step of a path walk on @nd.
 *
 * Returns 0 on success. Returns -ECHILD when LOOKUP_RCU is set and
 * try_to_unlazy() fails, and -EXDEV when LOOKUP_IS_SCOPED is set and
 * nd->path is not under nd->root. When ND_JUMPED is set and the dentry has
 * DCACHE_OP_WEAK_REVALIDATE, ->d_weak_revalidate() is called: a positive
 * result yields 0, a zero result yields -ESTALE, and any other result is
 * returned as is.
 */
static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
if (nd->flags & LOOKUP_RCU) {
/*
* We don't want to zero nd->root for scoped-lookups or
* externally-managed nd->root.
*/
if (!(nd->state & ND_ROOT_PRESET))
if (!(nd->flags & LOOKUP_IS_SCOPED))
nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED;
/* A failed try_to_unlazy() is reported to the caller as -ECHILD. */
if (!try_to_unlazy(nd))
return -ECHILD;
}
if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
/*
* While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
* ever step outside the root during lookup" and should already
* be guaranteed by the rest of namei, we want to avoid a namei
* BUG resulting in userspace being given a path that was not
* scoped within the root at some point during the lookup.
*
* So, do a final sanity-check to make sure that in the
* worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
* we won't silently return an fd completely outside of the
* requested root to userspace.
*
* Userspace could move the path outside the root after this
* check, but as discussed elsewhere this is not a concern (the
* resolved file was inside the root at some point).
*/
if (!path_is_under(&nd->path, &nd->root))
return -EXDEV;
}
/* Weak revalidation is only attempted when ND_JUMPED is set ... */
if (likely(!(nd->state & ND_JUMPED)))
return 0;
/* ... and the dentry has DCACHE_OP_WEAK_REVALIDATE set. */
if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
return 0;
status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
if (status > 0)
return 0;
/* A zero result is turned into -ESTALE. */
if (!status)
status = -ESTALE;
return status;
}
✨ mnt_want_write()
- 쓰기 작업 가능 여부 확인 및 잠금 (언마운트 되지 않도록 보호)
- /fs/namespace.c, $511
/*
 * mnt_want_write - request write access to the mount @m
 *
 * Calls sb_start_write() on the mount's superblock, then
 * mnt_get_write_access() on the mount. If the second step fails, the first
 * is undone with sb_end_write().
 *
 * Returns the result of mnt_get_write_access(); do_unlinkat() treats a
 * non-zero value as an error.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	// 1. Announce the start of a write on the superblock.
	// NOTE(review): the author describes this as a counter increment
	// rather than a lock - confirm against sb_start_write().
	sb_start_write(m->mnt_sb);
	// 2. Request write access on the mount
	ret = mnt_get_write_access(m);
	// Roll back step 1 on failure
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
✨ lookup_one_qstr_excl()
- 삭제할 dentry 탐색
- /fs/namei.c, $1673
- dir->i_op->lookup() 호출을 통해 파일 시스템별 lookup() 함수 호출 (예: ext4_lookup())
/*
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
 *
 * Returns whatever lookup_dcache() finds when it is non-NULL. Otherwise
 * returns ERR_PTR(-ENOENT) for a dead directory, ERR_PTR(-ENOMEM) when
 * d_alloc() fails, or the dentry produced through the directory's
 * ->lookup() method.
 */
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
				    struct dentry *base,
				    unsigned int flags)
{
	// 1. Check whether the name is already present in the dentry cache
	struct dentry *dentry = lookup_dcache(name, base, flags);
	struct dentry *old;
	struct inode *dir = base->d_inode;

	if (dentry)
		return dentry;

	/* Don't create child dentry for a dead directory. */
	// Per the author's note: the parent was removed/unlinked, so fail the lookup
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

	// 2. Ask the filesystem to look the name up. If ->lookup() returns a
	//    non-NULL dentry, that one replaces the freshly allocated dentry.
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
	return dentry;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
const struct inode_operations ext4_dir_inode_operations = {
.create = ext4_create,
**.lookup = ext4_lookup**,
.link = ext4_link,
.unlink = ext4_unlink,
.symlink = ext4_symlink,
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename = ext4_rename2,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
.get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
.fileattr_set = ext4_fileattr_set,
};
- ext4 디렉터리용 inode_operations 구조체에 lookup 포함. 이를 토대로 찾아서 ext4_lookup() 함수 호출
✨ ext4_lookup()
- dentry 이름 -> inode로 매핑
- /fs/ext4/namei.c, $1788
/*
 * ext4 ->lookup(): map the name in @dentry to an inode inside directory @dir.
 *
 * Returns ERR_PTR(-ENAMETOOLONG) for names longer than EXT4_NAME_LEN, the
 * error from ext4_lookup_entry() cast to a dentry pointer,
 * ERR_PTR(-EFSCORRUPTED) for an invalid inode number, an entry that points
 * back at @dir, or an entry referencing a deleted inode, and
 * ERR_PTR(-EPERM) for inconsistent encryption contexts. Returns NULL
 * without caching a negative dentry for a casefolded directory when nothing
 * was found (CONFIG_UNICODE). Otherwise returns the result of
 * d_splice_alias().
 */
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	struct inode *inode;
	struct ext4_dir_entry_2 *de;
	struct buffer_head *bh;

	// Reject over-long names up front
	if (dentry->d_name.len > EXT4_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	// 1. Search the directory for the entry
	bh = ext4_lookup_entry(dir, dentry, &de);
	if (IS_ERR(bh))
		return ERR_CAST(bh);
	inode = NULL;
	if (bh) {
		__u32 ino = le32_to_cpu(de->inode);
		brelse(bh);
		if (!ext4_valid_inum(dir->i_sb, ino)) {
			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
			return ERR_PTR(-EFSCORRUPTED);
		}
		if (unlikely(ino == dir->i_ino)) {
			EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
					 dentry);
			return ERR_PTR(-EFSCORRUPTED);
		}
		// 2. Fetch the inode and validate it
		inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
		if (inode == ERR_PTR(-ESTALE)) {
			EXT4_ERROR_INODE(dir,
					 "deleted inode referenced: %u",
					 ino);
			return ERR_PTR(-EFSCORRUPTED);
		}
		if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
		    (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
		    !fscrypt_has_permitted_context(dir, inode)) {
			ext4_warning(inode->i_sb,
				     "Inconsistent encryption contexts: %lu/%lu",
				     dir->i_ino, inode->i_ino);
			iput(inode);
			return ERR_PTR(-EPERM);
		}
	}

	if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
		/* Eventually we want to call d_add_ci(dentry, NULL)
		 * for negative dentries in the encoding case as
		 * well.  For now, prevent the negative dentry
		 * from being cached.
		 */
		return NULL;
	}

	// 3. Connect the inode to the dentry.
	// NOTE(review): per the author's note, an already-cached dentry for
	// the inode is reused here - confirm against d_splice_alias().
	return d_splice_alias(inode, dentry);
}
✨ ext4_lookup_entry()
- 특정 dentry를 찾아 해당 inode 번호 등의 정보 추출
- /fs/ext4/namei.c, $1715
/*
 * Find the directory entry for @dentry's name in @dir.
 *
 * Returns NULL when ext4_fname_prepare_lookup() reports -ENOENT,
 * ERR_PTR(err) for any other preparation error, and otherwise the result
 * of __ext4_find_entry(), which also receives @res_dir. The prepared
 * filename is freed before returning.
 */
static struct buffer_head *ext4_lookup_entry(struct inode *dir,
					     struct dentry *dentry,
					     struct ext4_dir_entry_2 **res_dir)
{
	int err;
	struct ext4_filename fname;
	struct buffer_head *bh;

	// 1. Prepare the name for lookup
	err = ext4_fname_prepare_lookup(dir, dentry, &fname);
	if (err == -ENOENT)
		return NULL;
	if (err)
		return ERR_PTR(err);

	// 2. Search for the entry
	bh = __ext4_find_entry(dir, &fname, res_dir, NULL);

	// 3. Release what step 1 allocated
	ext4_fname_free_filename(&fname);
	return bh;
}
✨ vfs_unlink()
- 파일 삭제 함수
- /fs/namei.c, $4511
/**
* vfs_unlink - unlink a filesystem object
* @idmap: idmap of the mount the inode was found from
* @dir: parent directory
* @dentry: victim
* @delegated_inode: returns victim inode, if the inode is delegated.
*
* The caller must hold dir->i_mutex.
*
* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
* return a reference to the inode in delegated_inode. The caller
* should then break the delegation on that inode and retry. Because
* breaking a delegation may take a long time, the caller should drop
* dir->i_mutex before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*
* If the inode has been found through an idmapped mount the idmap of
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
*/
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, struct inode **delegated_inode)
{
struct inode *target = dentry->d_inode;
**// 1. 삭제 권한 검사**
int error = may_delete(idmap, dir, dentry, 0);
if (error)
return error;
if (!dir->i_op->unlink)
return -EPERM;
**// 2. 락 걸기**
inode_lock(target);
if (IS_SWAPFILE(target))
error = -EPERM;
else if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
if (!error) {
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
**// 3. 삭제**
error = **dir->i_op->unlink(dir, dentry)**;
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
}
}
}
out:
inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
fsnotify_unlink(dir, dentry);
} else if (!error) {
**// 4. inode 링크 수 변경 및 해당 dentry 삭제를 알림**
fsnotify_link_count(target);
d_delete_notify(dir, dentry);
}
return error;
}
EXPORT_SYMBOL(vfs_unlink);
✨ ext4_unlink()
- dir 디렉터리 내 dentry -> d_name 삭제 함수
- /fs/ext4/namei.c, $3308
/*
 * ext4 ->unlink(): remove the name of @dentry from directory @dir.
 *
 * Returns -EIO when ext4_forced_shutdown() reports a shutdown, a
 * dquot_initialize() error for either inode, or the result of
 * __ext4_unlink(). The exit tracepoint fires on every path past the
 * shutdown check.
 */
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
	int retval;

	if (unlikely(ext4_forced_shutdown(dir->i_sb)))
		return -EIO;

	trace_ext4_unlink_enter(dir, dentry);
	/*
	 * Initialize quotas before so that eventual writes go
	 * in separate transaction
	 */
	retval = dquot_initialize(dir);
	if (retval)
		goto out_trace;
	retval = dquot_initialize(d_inode(dentry));
	if (retval)
		goto out_trace;

	// The actual removal is done by __ext4_unlink()
	retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);

	/* VFS negative dentries are incompatible with Encoding and
	 * Case-insensitiveness. Eventually we'll want avoid
	 * invalidating the dentries here, alongside with returning the
	 * negative dentries at ext4_lookup(), when it is better
	 * supported by the VFS for the CI case.
	 */
	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
		d_invalidate(dentry);

out_trace:
	trace_ext4_unlink_exit(dentry, retval);
	return retval;
}
✨ __ext4_unlink()
- dentry 검색 및 제거, inode 변경사항 기록
- /fs/ext4/namei.c, $3235
int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
struct inode *inode,
struct dentry *dentry /* NULL during fast_commit recovery */)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
handle_t *handle;
int skip_remove_dentry = 0;
/*
* Keep this outside the transaction; it may have to set up the
* directory's encryption key, which isn't GFP_NOFS-safe.
*/
**// 1. dentry 검색**
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
return -ENOENT;
**// 2. 찾은 디렉터리 엔트리의 inode와 삭제 대상 inode가 일치하는지 검사**
if (le32_to_cpu(de->inode) != inode->i_ino) {
/*
* It's okay if we find dont find dentry which matches
* the inode. That's because it might have gotten
* renamed to a different inode number
*/
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
goto out_bh;
}
**// 3. 저널 트랜잭션 시작**
handle = ext4_journal_start(dir, EXT4_HT_DIR, // 분석하고 싶었으나 시간이 부족해..
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
goto out_bh;
}
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
if (!skip_remove_dentry) {
**// 4. 디렉터리 엔트리 제거**
retval = **ext4_delete_entry(handle, dir, de, bh)**;
if (retval)
goto out_handle;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
goto out_handle;
} else {
retval = 0;
}
**// 5. 링크 -- 및 orphan 처리**
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
d_name->len, d_name->name);
else
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode_set_ctime_current(inode);
**// 6. inode 변경 사항 기록 (저널에 반영)**
retval = ext4_mark_inode_dirty(handle, inode);
if (dentry && !retval)
ext4_fc_track_unlink(handle, dentry);
out_handle:
ext4_journal_stop(handle);
out_bh:
brelse(bh);
return retval;
}
✨ ext4_delete_entry()
- dentry 제거 및 inode=0 세팅
- /fs/ext4/namei.c, $2721
/*
 * Delete the directory entry @de_del, located in buffer @bh, from @dir
 * under journal handle @handle.
 *
 * Returns 0 on success or a negative errno. For a directory with inline
 * data, the result of ext4_delete_inline_entry() is returned directly when
 * has_inline_data is still set after the call. Errors other than -ENOENT
 * on the regular path are reported through ext4_std_error().
 */
static int ext4_delete_entry(handle_t *handle,
			     struct inode *dir,
			     struct ext4_dir_entry_2 *de_del,
			     struct buffer_head *bh)
{
	int err, csum_size = 0;

	// 1. Inline-data case (per the author: directory contents stored in the inode)
	if (ext4_has_inline_data(dir)) {
		int has_inline_data = 1;
		err = ext4_delete_inline_entry(handle, dir, de_del, bh,
					       &has_inline_data);
		if (has_inline_data)
			return err;
	}

	if (ext4_has_metadata_csum(dir->i_sb))
		csum_size = sizeof(struct ext4_dir_entry_tail);

	BUFFER_TRACE(bh, "get_write_access");
	// 2. Ask the journal for write access to the block;
	//    on failure the block is not modified
	err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
					    EXT4_JTR_NONE);
	if (unlikely(err))
		goto out;

	// 3. Remove the entry.
	// NOTE(review): the author's note says the entry's inode field is
	// set to 0 - confirm against ext4_generic_delete_entry().
	err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
					dir->i_sb->s_blocksize, csum_size);
	if (err)
		goto out;

	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_dirblock(handle, dir, bh);
	if (unlikely(err))
		goto out;

	return 0;
out:
	if (err != -ENOENT)
		ext4_std_error(dir->i_sb, err);
	return err;
}