linux kernel源码剖析共享内存部分 IPC 虚拟内存映射 VMA shmget shmat shmdt shmctl

实现进程间通信的方法很多，比如：

管道（同一台机器上两个进程双向通信）

套接字（不同机器上的两个进程间的通信）

System V IPC机制（同一机器上，许多进程相互通信）IPC有以下三个机制：

1消息队列：

用于信息传递频繁且内容较少的进程间通信

2 信号量：

用于实现进程间同步通信

3 共享内存：

用于信息内容比较多的进程间通信

内核中实现IPC子系统的源码组织结构：

其中msg.h msg.c组成第一种IPC机制消息队列，sem.h sem.c组成信号量，shm.h,shm.c组成共享内存机制。util.h,util.c将三种独立的机制结合起来，统一初始化，统一管理标识。

IPC是一组系统调用，每个IPC资源都有一个32位的关键字（key）和一个32位的唯一标识符（identifier）。每个IPC资源都有一个ipc_perm数据结构，记录了IPC关键字，创建者和所有者的UID、GID，读写权限，以及seq（IPC标识符的当前序号）等信息。

/*
 * Permission and identity record attached to every System V IPC
 * resource: the key it was created with, owner and creator IDs, the
 * access mode bits and the current sequence number of its identifier.
 */
struct ipc_perm
{
	__kernel_key_t	key;	/* IPC key */
	__kernel_uid_t	uid;	/* owner user ID */
	__kernel_gid_t	gid;	/* owner group ID */
	__kernel_uid_t	cuid;	/* creator user ID */
	__kernel_gid_t	cgid;	/* creator group ID */
	__kernel_mode_t	mode;	/* read/write permission bits */
	unsigned short	seq;	/* current sequence number of the identifier */
};

创建机制的函数分别为msgget（）、semget（）、shmget（），创建成功后都会返回一个唯一的IPC标识，使用完成后可根据这个标识释放资源。

共享内存API：

shmget（）创建一块共享内存

shmat（）将已经存在的共享内存映射到进程地址空间

shmdt（）取消该映射

shmctl（）根据操作管理共享内存

int shmget（key_t key,int size,int shmflg）

成功返回共享内存标识符，失败返回-1；

key唯一标识键值，有两种情况会创建新的共享内存;

a.key值设为IPC_PRIVATE。

b.key值不是IPC_PRIVATE，系统中还不存在与该key对应的共享内存，并且shmflg设置了IPC_CREAT。

size建立共享内存的长度

内存的分配以页为单位，实际占用的大小会向上取整为页大小（通常为4K）的整数倍；例如size在1~4096之间时都会分配一页。

shmflg IPC_CREAT IPC_EXCL

每创建好一块共享内存。都会用shmid_ds去维护：

/*
 * Per-segment descriptor maintained for each shared memory segment:
 * permissions, size, timestamps, creator/last-operator pids and the
 * current attach count.
 */
struct shmid_ds {
	struct ipc_perm		shm_perm;	/* operation perms */
	int			shm_segsz;	/* size of segment (bytes) */
	__kernel_time_t		shm_atime;	/* last attach time */
	__kernel_time_t		shm_dtime;	/* last detach time */
	__kernel_time_t		shm_ctime;	/* last change time */
	__kernel_ipc_pid_t	shm_cpid;	/* pid of creator */
	__kernel_ipc_pid_t	shm_lpid;	/* pid of last operator */
	unsigned short		shm_nattch;	/* no. of current attaches */
	unsigned short		shm_unused;	/* compatibility */
	void			*shm_unused2;	/* ditto - used by DIPC */
	void			*shm_unused3;	/* unused */
};

void *shmat（int shmid,const void *shmaddr,int shmflg）

成功返回连接的地址，失败返回(void *)-1；调用完成后，shm_nattch += 1；

shmid

shmget返回的内存标识

shmaddr

把shmid标识的内存连接到进程数据段，根据shmaddr值可分为：

shmaddr = 0;内核从1~1.5G范围从高位到低位自动选择一块空闲的未被映射的内存，并返回地址。

shmaddr != 0,shmflg未设置SHM_RND，以shmaddr为连接地址

shmaddr != 0，shmflg设置了SHM_RND，shmaddr自动向下规整为SHMLBA的整数倍。

shmflg

设置SHM_RND同时可设置SHM_RDONLY标志位，改变读写权限，以只读方式绑定。

int shmdt（const void *shmaddr）

成功返回0，失败返回-1。调用完成后，shm_nattch -= 1；

int shmctl（int shmid, int cmd, struct shmid_ds *buf）

cmd 将要控制的操作

IPC_STAT 将shmid_ds复制到缓冲区buf。

IPC_RMID标记共享内存已销毁，最后一个连接分离也即shm_nattch = 0；时销毁。

IPC机制中共享内存的工作流程

IPC中系统创建共享内存的函数和函数之间的调用关系。

在util.c里面有ipc_init函数：

/*
 * Boot-time entry point for the System V IPC subsystem: runs the
 * initialisers of the semaphore, message-queue and shared-memory
 * mechanisms in turn and always reports success.
 */
static int __init ipc_init(void)
{
sem_init();
msg_init();
shm_init();
return 0;
}

初始化了三种IPC机制，其中shm_init是初始化共享内存的。以下进入shm_init函数详细介绍。

/*
 * Initialise the shared-memory identifier set (shm_ids) with an initial
 * table size of 1 and register the "sysvipc/shm" proc interface.  The
 * banner string is the column header of that file; each row is produced
 * by sysvipc_shm_proc_show().
 */
void __init shm_init (void)
{
ipc_init_ids(&shm_ids, 1);
ipc_init_proc_interface("sysvipc/shm",
"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
&shm_ids,
sysvipc_shm_proc_show);
}

1、先看的shm_ids结构体

/* Identifier set used for all shared memory segments. */
static struct ipc_ids shm_ids;
/*
 * Bookkeeping for one family of IPC identifiers.
 */
struct ipc_ids {
int in_use;	/* reported as used_ids by the SHM_INFO command */
int max_id;	/* upper bound of the scan in ipc_findkey(); -1 after init */
unsigned short seq;	/* reset to 0 by ipc_init_ids() */
unsigned short seq_max;	/* limit chosen in ipc_init_ids(), at most USHRT_MAX */
struct semaphore sem;	/* initialised to 1; taken with down()/up() around id-set operations */
struct ipc_id_ary nullentry;	/* zero-sized fallback table used if allocation fails */
struct ipc_id_ary* entries;	/* table of pointers to the IPC resources */
};

ipc_ids里面包括了指向保存IPC资源的指针数组p

/*
 * Table of IPC resource pointers.  The pointer array trails the header
 * and is sized at allocation time (see ipc_init_ids()).
 */
struct ipc_id_ary {
int size;	/* number of slots in p[] */
struct kern_ipc_perm *p[0];	/* slot i is NULL when unused */
};

p是kern_ipc_perm*类型

/*
 * used by in-kernel data structures
 *
 * Basic per-resource information: key, owner/creator IDs, access mode
 * and the sequence number used to build the user-visible identifier.
 */
struct kern_ipc_perm
{
spinlock_t lock;	/* per-resource spinlock */
int deleted;
key_t key;	/* compared against the lookup key in ipc_findkey() */
uid_t uid;	/* owner user ID */
gid_t gid;	/* owner group ID */
uid_t cuid;	/* creator user ID */
gid_t cgid;	/* creator group ID */
mode_t mode; 
unsigned long seq;	/* passed to shm_buildid() when building an identifier */
void *security;	/* set to NULL in newseg() before security_shm_alloc() */
};

kern_ipc_perm存储资源的基本信息，包括key值，创建者、拥有者的UID GID以及访问权限，当前唯一标识的顺序编号seq。

其中spinlock_t是一个自旋锁，类型如下：

/*
 * Spinlock type: the raw architecture lock plus optional fields that
 * exist only in preemptible SMP builds or when spinlock debugging is
 * configured.
 */
typedef struct {
raw_spinlock_t raw_lock;
#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
unsigned int break_lock;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
unsigned int magic, owner_cpu;
void *owner;
#endif
} spinlock_t;

2、跟踪到ipc_init_ids函数

设置ids结构体中的参数，开辟资源。

/**
 * ipc_init_ids - set up an empty IPC identifier set
 * @ids: identifier set to initialise
 * @size: requested number of identifier slots (clamped to IPCMNI)
 *
 * Resets the counters, derives the largest usable sequence number and
 * allocates the slot table with every slot empty.  If the table cannot
 * be allocated an error is logged and the set falls back to its
 * built-in zero-sized table.
 */

void __init ipc_init_ids(struct ipc_ids* ids, int size)
{
	const int seq_limit = INT_MAX/SEQ_MULTIPLIER;
	struct ipc_id_ary *table;
	int slot;

	sema_init(&ids->sem,1);

	if (size > IPCMNI)
		size = IPCMNI;

	ids->in_use = 0;
	ids->max_id = -1;
	ids->seq = 0;
	/* seq_max is an unsigned short, so cap the limit at USHRT_MAX */
	ids->seq_max = (seq_limit > USHRT_MAX) ? USHRT_MAX : seq_limit;

	table = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*size +
			      sizeof(struct ipc_id_ary));
	if (table == NULL) {
		printk(KERN_ERR "ipc_init_ids() failed, ipc service disabled.\n");
		size = 0;
		table = &ids->nullentry;
	}
	ids->entries = table;

	table->size = size;
	for (slot = 0; slot < size; slot++)
		table->p[slot] = NULL;
}

为rcu头结构和对象分配内存，返回对象的指针,分配失败返回NULL。

/**
 * ipc_rcu_alloc- allocate ipc and rcu space 
 * @size: size desired
 *
 * Allocate memory for the rcu header structure +  the object.
 * Returns the pointer to the object.
 * NULL is returned if the allocation fails. 
 */
 
void* ipc_rcu_alloc(int size)
{
void* out;
/* 
* We prepend the allocation with the rcu struct, and
* workqueue if necessary (for vmalloc). 
*/
if (rcu_use_vmalloc(size)) {
out = vmalloc(HDRLEN_VMALLOC + size);
if (out) {
out += HDRLEN_VMALLOC;
container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1;
container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
}
} else {
out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL);
if (out) {
out += HDRLEN_KMALLOC;
container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0;
container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
}
}


return out;
}

3、跟踪ipc_init_proc_interface函数

在进函数之前，先看参数seq_file结构体

/*
 * Output state handed to seq_file show routines.
 */
struct seq_file {
char *buf;	/* output buffer that seq_printf() formats into */
size_t size;	/* capacity of buf as checked by seq_printf() */
size_t from;
size_t count;	/* bytes of buf already used; set to size on overflow */
loff_t index;
loff_t version;
struct semaphore sem;
struct seq_operations *op;
void *private;
};

设置参数，创建接口函数ipc_init_proc_interface

/**
 * ipc_init_proc_interface - register a seq_file based proc entry for
 *   one sysvipc type
 * @path: path of the entry in procfs
 * @header: banner printed at the beginning of the file
 * @ids: ipc id table to iterate
 * @show: routine that prints one entry
 *
 * The arguments are stored in a freshly allocated ipc_proc_iface that
 * is attached to the new proc entry.  If either allocation or entry
 * creation fails the function returns silently without registering
 * anything.
 */
void __init ipc_init_proc_interface(const char *path, const char *header,
				    struct ipc_ids *ids,
				    int (*show)(struct seq_file *, void *))
{
	struct ipc_proc_iface *desc;
	struct proc_dir_entry *entry;

	desc = kmalloc(sizeof(*desc), GFP_KERNEL);
	if (!desc)
		return;

	desc->path = path;
	desc->header = header;
	desc->ids = ids;
	desc->show = show;

	entry = create_proc_entry(path,
				  S_IRUGO,	/* world readable */
				  NULL		/* parent dir */);
	if (!entry) {
		/* nothing refers to the descriptor, so release it */
		kfree(desc);
		return;
	}

	entry->data = desc;
	entry->proc_fops = &sysvipc_proc_fops;
}

在此函数中调用了三个函数，kmalloc,create_proc_entry,kfree.

kmalloc是内存分配函数

/*
 * kmalloc - allocate @size bytes with allocation flags @flags.
 *
 * When @size is a compile-time constant, the CACHE() macro is expanded
 * once per entry of kmalloc_sizes.h: each expansion jumps to "found" if
 * the request fits that size class, otherwise it advances the index i.
 * If no class fits, control reaches a call to a function that is only
 * declared here (presumably never defined, so an oversized constant
 * request is caught at build time -- confirm).  The object is then
 * taken from the cache for class i, using the DMA cache when GFP_DMA is
 * set.  Non-constant sizes are handled by __kmalloc().
 */
static inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
int i = 0;
#define CACHE(x) \
if (size <= x) \
goto found; \
else \
i++;
#include "kmalloc_sizes.h"
#undef CACHE
{
extern void __you_cannot_kmalloc_that_much(void);
__you_cannot_kmalloc_that_much();
}
found:
return kmem_cache_alloc((flags & GFP_DMA) ?
malloc_sizes[i].cs_dmacachep :
malloc_sizes[i].cs_cachep, flags);
}
return __kmalloc(size, flags);
}

create_proc_entry用于在procfs中创建一个表项；下面贴出的是一个直接返回NULL的内联空实现（看起来是内核未启用proc文件系统时使用的版本）。

/*
 * Stub variant of create_proc_entry(): ignores its arguments and always
 * returns NULL, so callers take their "creation failed" path.
 * NOTE(review): looks like the version used when procfs is compiled
 * out -- confirm against the surrounding header.
 */
static inline struct proc_dir_entry *create_proc_entry(const char *name,
mode_t mode, struct proc_dir_entry *parent) { return NULL; }

第一个参数name，接收的是ipc_init_proc_interface函数传入的path，即要在procfs中创建的表项的路径名。

第二个参数 mode，在调用的时候传入的参数为S_IRUGO，表示所有用户可读，也即权限是0444（0400 | 0040 | 0004）。

第三个参数 parent，指向父目录项；这里调用时传入的是NULL。

kfree释放内存

/**
 * kfree - free previously allocated memory
 * @objp: pointer returned by kmalloc.
 *
 * If @objp is NULL, no operation is performed.
 *
 * Don't free memory not originally allocated by kmalloc()
 * or you will run into trouble.
 */
void kfree(const void *objp)
{
kmem_cache_t *c;
unsigned long flags;


if (unlikely(!objp))
return;
/* the cache lookup and release run with local interrupts disabled */
local_irq_save(flags);
kfree_debugcheck(objp);
/* find the cache owning the object through its page */
c = page_get_cache(virt_to_page(objp));
__cache_free(c, (void*)objp);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);

上述ipc_init_proc_interface函数中用到的proc_dir_entry结构体

/*
 * Descriptor of one procfs entry.
 */
struct proc_dir_entry {
unsigned int low_ino;
unsigned short namelen;
const char *name;
mode_t mode;
nlink_t nlink;
uid_t uid;
gid_t gid;
unsigned long size;
struct inode_operations * proc_iops;
struct file_operations * proc_fops;	/* set to &sysvipc_proc_fops by ipc_init_proc_interface() */
get_info_t *get_info;
struct module *owner;
struct proc_dir_entry *next, *parent, *subdir;
void *data;	/* ipc_init_proc_interface() stores its ipc_proc_iface here */
read_proc_t *read_proc;
write_proc_t *write_proc;
atomic_t count;/* use count */
int deleted; /* delete flag */
void *set;
};

inode_operations结构体

/*
 * Table of callbacks a filesystem supplies for operations on inodes
 * (create, lookup, link/unlink, mkdir/rmdir, rename, attribute and
 * extended-attribute handling, ...).  Each member is a function
 * pointer.
 */
struct inode_operations {
int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,int);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,int,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
int (*readlink) (struct dentry *, char __user *,int);
void * (*follow_link) (struct dentry *, struct nameidata *);
void (*put_link) (struct dentry *, struct nameidata *, void *);
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, struct nameidata *);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
};

其中目录信息的结构体dentry

/*
 * Directory entry: ties a name (d_name) under a parent directory
 * (d_parent) to an inode (d_inode, NULL for a negative entry).
 */
struct dentry {
atomic_t d_count;
unsigned int d_flags;/* protected by d_lock */
spinlock_t d_lock;/* per dentry lock */
struct inode *d_inode;/* Where the name belongs to - NULL is
* negative */
/*
* The next three fields are touched by __d_lookup.  Place them here
* so they all fit in a cache line.
*/
struct hlist_node d_hash;/* lookup hash list */
struct dentry *d_parent;/* parent directory */
struct qstr d_name;


struct list_head d_lru;/* LRU list */
struct list_head d_child;/* child of parent list */
struct list_head d_subdirs;/* our children */
struct list_head d_alias;/* inode alias list */
unsigned long d_time;/* used by d_revalidate */
struct dentry_operations *d_op;
struct super_block *d_sb;/* The root of the dentry tree */
void *d_fsdata;/* fs-specific data */
  struct rcu_head d_rcu;
struct dcookie_struct *d_cookie; /* cookie, if any */
int d_mounted;
unsigned char d_iname[DNAME_INLINE_LEN_MIN];/* small names */
};
dentry_operations结构体
/*
 * Callbacks a filesystem may attach to a dentry through d_op.
 */
struct dentry_operations {
int (*d_revalidate)(struct dentry *, struct nameidata *);
int (*d_hash) (struct dentry *, struct qstr *);
int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
};


/* the dentry parameter passed to d_hash and d_compare is the parent
 * directory of the entries to be compared. It is used in case these
 * functions need any directory specific information for determining
 * equivalency classes.  Using the dentry itself might not work, as it
 * might be a negative dentry which has no information associated with
 * it */


/*
locking rules:
		big lock	dcache_lock	d_lock   may block
d_revalidate:	no		no		no       yes
d_hash		no		no		no       yes
d_compare:	no		yes		yes      no
d_delete:	no		yes		no       no
d_release:	no		no		no       yes
d_iput:		no		no		no       yes
 */

结构体ipc_proc_iface

struct ipc_proc_iface*iface;

/*
 * Description of one sysvipc proc file, filled in by
 * ipc_init_proc_interface() and stored in the proc entry's data field.
 */
struct ipc_proc_iface {
const char *path;	/* path of the entry in procfs */
const char *header;	/* banner printed at the beginning of the file */
struct ipc_ids *ids;	/* ipc id table to iterate */
int (*show)(struct seq_file *, void *);	/* prints one entry */
};

其中涉及的ipc_ids和seq_file在上面已提到。

跟踪到sysvipc_shm_proc_show函数

/*
 * Print one shared memory segment as a row of the sysvipc/shm proc
 * file.  @it is the segment's shmid_kernel.  The row format only
 * differs in the width of the size column, chosen by comparing
 * sizeof(size_t) with sizeof(int).  Returns the result of seq_printf().
 */
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
struct shmid_kernel *shp = it;
char *format;


#define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
#define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"


if (sizeof(size_t) <= sizeof(int))
format = SMALL_STRING;
else
format = BIG_STRING;
/* for hugepage-backed segments the attach count is derived from the file's reference count */
return seq_printf(s, format,
 shp->shm_perm.key,
 shp->id,
 shp->shm_flags,
 shp->shm_segsz,
 shp->shm_cprid,
 shp->shm_lprid,
 is_file_hugepages(shp->shm_file) ? (file_count(shp->shm_file) - 1) : shp->shm_nattch,
 shp->shm_perm.uid,
 shp->shm_perm.gid,
 shp->shm_perm.cuid,
 shp->shm_perm.cgid,
 shp->shm_atim,
 shp->shm_dtim,
 shp->shm_ctim);
}

传入seq_file结构体和要显示的共享内存段，把该System V IPC资源的信息格式化后用seq_printf打印出来。

/*
 * seq_printf - append printf-style formatted text to a seq_file buffer
 * @m: seq_file whose buffer receives the text
 * @f: printf-style format string, followed by its arguments
 *
 * Returns 0 when the formatted text fits in the remaining space.  If
 * the buffer is already full, or the text does not fit, the buffer is
 * marked completely full (count = size) and -1 is returned.
 */
int seq_printf(struct seq_file *m, const char *f, ...)
{
	va_list ap;
	int written;

	if (m->count >= m->size)
		goto overflow;

	va_start(ap, f);
	written = vsnprintf(m->buf + m->count, m->size - m->count, f, ap);
	va_end(ap);

	if (m->count + written >= m->size)
		goto overflow;

	m->count += written;
	return 0;

overflow:
	m->count = m->size;
	return -1;
}
EXPORT_SYMBOL(seq_printf);

这里seq_printf是一个可变参函数。传入多个参数，并调用vsnprintf函数打印。

系统API：

shm API调用整体的流程图如下：

四个函数都是被sys_ipc函数管理：

/*
 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
 *
 * This is really horribly ugly.
 *
 * @call selects the operation in its low 16 bits and carries a version
 * number in the bits above; the remaining arguments are reinterpreted
 * per operation and forwarded to the matching sys_sem*(), sys_msg*() or
 * shm routine.  Unknown operations return -ENOSYS.
 */
asmlinkage int sys_ipc(uint call, int first, int second, int third,
      void __user *ptr, long fifth)
{
int version, ret;


version = call >> 16; /* hack for backward compatibility */
call &= 0xffff;


switch (call) {
case SEMOP:
/* plain semop is semtimedop without a timeout */
return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
case SEMTIMEDOP:
return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
(const struct timespec __user *)fifth);


case SEMGET:
return sys_semget (first, second, third);
case SEMCTL: {
union semun fourth;
if (!ptr)
return -EINVAL;
/* the semun argument is fetched from user space through ptr */
if (get_user(fourth.__pad, (void __user * __user *) ptr))
return -EFAULT;
return sys_semctl (first, second, third, fourth);
}


case MSGSND:
return sys_msgsnd(first, (struct msgbuf __user *) ptr, 
 second, third);
case MSGRCV:
switch (version) {
case 0: {
/* version 0 passes msgp and msgtyp packed in a user-space ipc_kludge */
struct ipc_kludge tmp;
if (!ptr)
return -EINVAL;
if (copy_from_user(&tmp,(struct ipc_kludge __user *)ptr,
  sizeof (tmp)))
return -EFAULT;
return sys_msgrcv (first, tmp.msgp, second,
  tmp.msgtyp, third);
}
default:
return sys_msgrcv (first,
  (struct msgbuf __user *) ptr,
  second, fifth, third);
}
case MSGGET:
return sys_msgget ((key_t) first, second);
case MSGCTL:
return sys_msgctl(first, second, (struct msqid_ds __user *)ptr);


case SHMAT:
switch (version) {
default: {
/* the attach address is written back to the user pointer passed in third */
ulong raddr;
ret = do_shmat(first, (char __user *)ptr, second, &raddr);
if (ret)
return ret;
return put_user(raddr, (ulong __user *)third);
}
case 1: /* Of course, we don't support iBCS2! */
return -EINVAL;
}
case SHMDT: 
return sys_shmdt ((char __user *)ptr);
case SHMGET:
return sys_shmget (first, second, third);
case SHMCTL:
return sys_shmctl (first, second,
  (struct shmid_ds __user *) ptr);
default:
return -ENOSYS;
}
}

一直不懂irix是什么东东，维基百科的解释：

IRIX是由Silicon Graphics（SGI）开发的在其MIPS工作站和服务器上本机运行的停产操作系统。它基于具有BSD扩展的UNIX系统V。 IRIX是第一个包含XFS文件系统的操作系统。

sys_shmget

/*
 * sys_shmget - look up or create the shared memory segment for @key.
 *
 * Runs under shm_ids.sem.  Returns the segment identifier on success
 * or a negative errno:
 *   -ENOENT  no segment has this key and IPC_CREAT is not set
 *   -EEXIST  the key exists and both IPC_CREAT and IPC_EXCL are set
 *   -EINVAL  the existing segment is smaller than @size
 *   -EACCES  the permission check against @shmflg fails
 * or whatever newseg() / security_shm_associate() report.
 */
asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
{
struct shmid_kernel *shp;
int err, id = 0;


down(&shm_ids.sem);
if (key == IPC_PRIVATE) {
/* a private key always creates a new segment */
err = newseg(key, shmflg, size);
} else if ((id = ipc_findkey(&shm_ids, key)) == -1) {
/* key not in use: create only when asked to */
if (!(shmflg & IPC_CREAT))
err = -ENOENT;
else
err = newseg(key, shmflg, size);
} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
err = -EEXIST;
} else {
/* key found: validate size and permissions, then build its identifier */
shp = shm_lock(id);
if(shp==NULL)
BUG();
if (shp->shm_segsz < size)
err = -EINVAL;
else if (ipcperms(&shp->shm_perm, shmflg))
err = -EACCES;
else {
int shmid = shm_buildid(id, shp->shm_perm.seq);
err = security_shm_associate(shp, shmflg);
if (!err)
err = shmid;
}
shm_unlock(shp);
}
up(&shm_ids.sem);


return err;
}

newseg创建新的键值为key，大小为size的共享内存，并返回创建好的共享内存的标识符。

/*
 * newseg - create a new shared memory segment.
 * @key: key recorded for the segment
 * @shmflg: creation flags; the S_IRWXUGO bits become the segment's mode
 * @size: requested size in bytes
 *
 * Returns the identifier of the new segment, or a negative errno:
 * -EINVAL when @size is outside [SHMMIN, shm_ctlmax], -ENOSPC when the
 * page total would reach shm_ctlall or no id slot is available,
 * -ENOMEM when the descriptor cannot be allocated, or the error from
 * the security hook or from setting up the backing file.
 */
static int newseg (key_t key, int shmflg, size_t size)
{
int error;
struct shmid_kernel *shp;
/* size rounded up to whole pages */
int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
struct file * file;
/* holds "SYSV" + 8 hex digits + terminating NUL */
char name[13];
int id;


if (size < SHMMIN || size > shm_ctlmax)
return -EINVAL;


if (shm_tot + numpages >= shm_ctlall)
return -ENOSPC;


shp = ipc_rcu_alloc(sizeof(*shp));
if (!shp)
return -ENOMEM;


shp->shm_perm.key = key;
shp->shm_flags = (shmflg & S_IRWXUGO);
shp->mlock_user = NULL;


shp->shm_perm.security = NULL;
error = security_shm_alloc(shp);
if (error) {
ipc_rcu_putref(shp);
return error;
}


/* the backing file is either a hugetlb file or a shmem file named after the key */
if (shmflg & SHM_HUGETLB) {
/* hugetlb_zero_setup takes care of mlock user accounting */
file = hugetlb_zero_setup(size);
shp->mlock_user = current->user;
} else {
int acctflag = VM_ACCOUNT;
/*
* Do not allow no accounting for OVERCOMMIT_NEVER, even
* if it's asked for.
*/
if  ((shmflg & SHM_NORESERVE) &&
sysctl_overcommit_memory != OVERCOMMIT_NEVER)
acctflag = 0;
sprintf (name, "SYSV%08x", key);
file = shmem_file_setup(name, size, acctflag);
}
error = PTR_ERR(file);
if (IS_ERR(file))
goto no_file;


error = -ENOSPC;
id = shm_addid(shp);
if(id == -1) 
goto no_id;


/* fill in the descriptor now that the segment has an id */
shp->shm_cprid = current->tgid;
shp->shm_lprid = 0;
shp->shm_atim = shp->shm_dtim = 0;
shp->shm_ctim = get_seconds();
shp->shm_segsz = size;
shp->shm_nattch = 0;
shp->id = shm_buildid(id,shp->shm_perm.seq);
shp->shm_file = file;
/* the backing file's inode number is set to the segment identifier */
file->f_dentry->d_inode->i_ino = shp->id;


/* Hugetlb ops would have already been assigned. */
if (!(shmflg & SHM_HUGETLB))
file->f_op = &shm_file_operations;


shm_tot += numpages;
shm_unlock(shp);
return shp->id;


/* error unwinding: release in the reverse order of acquisition */
no_id:
fput(file);
no_file:
security_shm_free(shp);
ipc_rcu_putref(shp);
return error;
}

按key查找对应的ID，这个ID是数组p的下标，这个数组管理着所有的共享内存。

/**
 * ipc_findkey - look up a key in an ipc identifier set
 * @ids: identifier set to search
 * @key: key to look for
 *
 * The caller must hold ipc_ids.sem, which is why the table can be read
 * without rcu_dereference().
 *
 * Returns the index of the first slot whose resource has @key, or -1
 * when no slot in 0..max_id matches.
 */

int ipc_findkey(struct ipc_ids* ids, key_t key)
{
	const int last = ids->max_id;
	int slot;

	for (slot = 0; slot <= last; slot++) {
		struct kern_ipc_perm *entry = ids->entries->p[slot];

		/* empty slots are skipped */
		if (entry != NULL && entry->key == key)
			return slot;
	}

	return -1;
}

shmat函数

/*
 * sys_shmat - system call wrapper around do_shmat().
 *
 * Returns the attach address on success, or the negative error code
 * reported by do_shmat().  force_successful_syscall_return() is called
 * before a successful return.
 */
asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg)
{
	unsigned long mapped;
	long rc = do_shmat(shmid, shmaddr, shmflg, &mapped);

	if (rc != 0)
		return rc;

	force_successful_syscall_return();
	return (long)mapped;
}

主要是在do_shmat函数中实现挂载

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 *
 * @shmid:   identifier of the segment to attach
 * @shmaddr: requested attach address, or NULL to let do_mmap() choose
 * @shmflg:  SHM_RND / SHM_RDONLY / SHM_EXEC / SHM_REMAP
 * @raddr:   receives the mapped address (or the error-encoded pointer)
 *
 * Returns 0 on success or a negative errno.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	unsigned long size;
	struct file * file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	unsigned long o_flags;
	int acc_mode;
	void *user_addr;


	if (shmid < 0) {
		err = -EINVAL;
		goto out;
	} else if ((addr = (ulong)shmaddr)) {
		/*
		 * Explicit address: a misaligned one is rounded down to an
		 * SHMLBA boundary with SHM_RND; without it the address is
		 * rejected (only if not page aligned, unless the
		 * architecture forces SHMLBA alignment).
		 */
		if (addr & (SHMLBA-1)) {
			if (shmflg & SHM_RND)
				addr &= ~(SHMLBA-1);  /* round down */
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
					return -EINVAL;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		/* SHM_REMAP makes no sense without an explicit address */
		if ((shmflg & SHM_REMAP))
			return -EINVAL;


		flags = MAP_SHARED;
	}


	/* translate the attach flags into mmap protection and the access mode to check */
	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		o_flags = O_RDONLY;
		acc_mode = S_IRUGO;
	} else {
		prot = PROT_READ | PROT_WRITE;
		o_flags = O_RDWR;
		acc_mode = S_IRUGO | S_IWUGO;
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}


	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	shp = shm_lock(shmid);
	if(shp == NULL) {
		err = -EINVAL;
		goto out;
	}
	err = shm_checkid(shp,shmid);
	if (err) {
		shm_unlock(shp);
		goto out;
	}
	if (ipcperms(&shp->shm_perm, acc_mode)) {
		shm_unlock(shp);
		err = -EACCES;
		goto out;
	}


	err = security_shm_shmat(shp, shmaddr, shmflg);
	if (err) {
		shm_unlock(shp);
		return err;
	}

	file = shp->shm_file;
	size = i_size_read(file->f_dentry->d_inode);
	/*
	 * Temporary attach count, dropped again below after the mapping
	 * attempt; presumably it keeps the segment from being destroyed
	 * while it is unlocked -- confirm.
	 */
	shp->shm_nattch++;
	shm_unlock(shp);


	down_write(&current->mm->mmap_sem);
	if (addr && !(shmflg & SHM_REMAP)) {
		user_addr = ERR_PTR(-EINVAL);
		/* without SHM_REMAP the target range must not overlap an existing mapping */
		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
		/*
		 * If shm segment goes below stack, make sure there is some
		 * space left for the stack to grow (at least 4 pages).
		 */
		if (addr < current->mm->start_stack &&
		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
			goto invalid;
	}

	user_addr = (void*) do_mmap (file, addr, size, prot, flags, 0);


invalid:
	up_write(&current->mm->mmap_sem);


	/* drop the temporary count; destroy the segment if it was marked SHM_DEST and nobody is attached */
	down (&shm_ids.sem);
	if(!(shp = shm_lock(shmid)))
		BUG();
	shp->shm_nattch--;
	if(shp->shm_nattch == 0 &&
	   shp->shm_flags & SHM_DEST)
		shm_destroy (shp);
	else
		shm_unlock(shp);
	up (&shm_ids.sem);


	*raddr = (unsigned long) user_addr;
	err = 0;
	if (IS_ERR(user_addr))
		err = PTR_ERR(user_addr);
out:
	return err;
}

先检查传入的shmid共享内存是否存在，存在则去检查它的访问权限。修改该共享内存shmid_kernel中的数据（比如连接数shm_nattch加1）

down_write开始映射，把共享内存连接到进程虚拟地址空间。

shmdt函数

/*
 * sys_shmdt - detach the shared memory segment attached at @shmaddr.
 *
 * Works in two passes under mmap_sem held for writing: the first pass
 * finds and unmaps the first shm vma whose start matches @shmaddr and
 * learns the segment size from its backing inode; the second pass
 * unmaps any remaining fragments of the same attachment within that
 * size.  Returns 0 if a matching vma was found in the first pass,
 * otherwise -EINVAL.
 */
asmlinkage long sys_shmdt(char __user *shmaddr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *next;
unsigned long addr = (unsigned long)shmaddr;
loff_t size = 0;
int retval = -EINVAL;


down_write(&mm->mmap_sem);


/*
* This function tries to be smart and unmap shm segments that
* were modified by partial mlock or munmap calls:
* - It first determines the size of the shm segment that should be
*   unmapped: It searches for a vma that is backed by shm and that
*   started at address shmaddr. It records it's size and then unmaps
*   it.
* - Then it unmaps all shm vmas that started at shmaddr and that
*   are within the initially determined size.
* Errors from do_munmap are ignored: the function only fails if
* it's called with invalid parameters or if it's called to unmap
* a part of a vma. Both calls in this function are for full vmas,
* the parameters are directly copied from the vma itself and always
* valid - therefore do_munmap cannot fail. (famous last words?)
*/
/*
* If it had been mremap()'d, the starting address would not
* match the usual checks anyway. So assume all vma's are
* above the starting address given.
*/
vma = find_vma(mm, addr);


while (vma) {
/* remember the successor first: do_munmap() below may remove vma */
next = vma->vm_next;


/*
* Check if the starting address would match, i.e. it's
* a fragment created by mprotect() and/or munmap(), or it
* otherwise it starts at this address with no hassles.
*/
if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) &&
(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {




size = vma->vm_file->f_dentry->d_inode->i_size;
do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
/*
* We discovered the size of the shm segment, so
* break out of here and fall through to the next
* loop that uses the size information to stop
* searching for matching vma's.
*/
retval = 0;
vma = next;
break;
}
vma = next;
}


/*
* We need look no further than the maximum address a fragment
* could possibly have landed at. Also cast things to loff_t to
* prevent overflows and make comparisions vs. equal-width types.
*/
while (vma && (loff_t)(vma->vm_end - addr) <= size) {
next = vma->vm_next;


/* finding a matching vma now does not alter retval */
if ((vma->vm_ops == &shm_vm_ops || is_vm_hugetlb_page(vma)) &&
(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)


do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
vma = next;
}


up_write(&mm->mmap_sem);
return retval;
}

shmdt断开进程与虚拟地址shmaddr的共享内存的连接。

首先找到进程虚拟空间中地址为shmaddr的VMA（linux管理进程虚拟地址空间的数据结构）。

/*
 * Look up the first VMA which satisfies  addr < vm_end,  NULL if none
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_list_struct *vml;


for (vml = mm->context.vmlist; vml; vml = vml->next)
if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
return vml->vma;


return NULL;
}


EXPORT_SYMBOL(find_vma);

这个函数的返回值是vm_area_struct类型，vm_area_struct结构体存储进程虚拟地址信息。

/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task.  A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 *
 * The area covers the addresses [vm_start, vm_end) of the address
 * space vm_mm.
 */
struct vm_area_struct {
struct mm_struct * vm_mm;/* The address space we belong to. */
unsigned long vm_start;/* Our start address within vm_mm. */
unsigned long vm_end;/* The first byte after our end address
  within vm_mm. */


/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next;


pgprot_t vm_page_prot;/* Access permissions of this VMA. */
unsigned long vm_flags;/* Flags, listed below. */


struct rb_node vm_rb;


/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap prio tree, or
* linkage to the list of like vmas hanging off its node, or
* linkage of vma in the address_space->i_mmap_nonlinear list.
*/
union {
struct {
struct list_head list;
void *parent; /* aligns with prio_tree_node parent */
struct vm_area_struct *head;
} vm_set;


struct raw_prio_tree_node prio_tree_node;
} shared;


/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages.  A MAP_SHARED vma
* can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_node;/* Serialized by anon_vma->lock */
struct anon_vma *anon_vma;/* Serialized by page_table_lock */


/* Function pointers to deal with this struct. */
struct vm_operations_struct * vm_ops;


/* Information about our backing store: */
unsigned long vm_pgoff;/* Offset (within vm_file) in PAGE_SIZE
  units, *not* PAGE_CACHE_SIZE */
struct file * vm_file;/* File we map to (can be NULL). */
void * vm_private_data;/* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */


#ifndef CONFIG_MMU
atomic_t vm_usage;/* refcount (VMAs shared if !MMU) */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy;/* NUMA policy for the VMA */
#endif
};

接下来，调用do_munmap函数，断开连接

/*
 * do_munmap - remove a mapping from the mm's vmlist.
 * @mm:   address space to modify
 * @addr: start address of the mapping
 * @len:  length of the mapping; 0 matches on the start address alone
 *
 * Only whole mappings are handled: the list is searched for an entry
 * whose VMA starts exactly at @addr (and, when @len is non-zero, ends
 * exactly at @addr + @len).  Returns 0 on success, or -EINVAL (after
 * logging a message) when no such entry exists.
 */
int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
{
struct vm_list_struct *vml, **parent;
unsigned long end = addr + len;


#ifdef DEBUG
printk("do_munmap:\n");
#endif


/* parent tracks the link that points at the current entry so it can be unlinked in place */
for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
if ((*parent)->vma->vm_start == addr &&
   ((len == 0) || ((*parent)->vma->vm_end == end)))
goto found;


printk("munmap of non-mmaped memory by process %d (%s): %p\n",
      current->pid, current->comm, (void *) addr);
return -EINVAL;


 found:
vml = *parent;


/* drop this list's reference on the VMA */
put_vma(vml->vma);


/* unlink the list node, update the allocation counters and free it */
*parent = vml->next;
realalloc -= kobjsize(vml);
askedalloc -= sizeof(*vml);
kfree(vml);


update_hiwater_vm(mm);
mm->total_vm -= len >> PAGE_SHIFT;


#ifdef DEBUG
show_process_blocks();
#endif


return 0;
}

shmctl函数

/*
 * sys_shmctl - control operations on shared memory segments.
 * @shmid: segment identifier (for SHM_STAT: an index into the id table)
 * @cmd:   operation, possibly carrying a version flag that
 *         ipc_parse_version() strips off
 * @buf:   user buffer read or written depending on @cmd
 *
 * Returns a non-negative value on success (0, the highest id in use for
 * IPC_INFO/SHM_INFO, or the full identifier for SHM_STAT) or a negative
 * errno.
 *
 * Exit labels: out_unlock_up unlocks the segment and releases
 * shm_ids.sem, out_up only releases the semaphore, out_unlock only
 * unlocks the segment, out just returns.
 */
asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
{
struct shm_setbuf setbuf;
struct shmid_kernel *shp;
int err, version;


if (cmd < 0 || shmid < 0) {
err = -EINVAL;
goto out;
}


version = ipc_parse_version(&cmd);


switch (cmd) { /* replace with proc interface ? */
/* IPC_INFO: report the system-wide shared memory limits */
case IPC_INFO:
{
struct shminfo64 shminfo;


err = security_shm_shmctl(NULL, cmd);
if (err)
return err;


memset(&shminfo,0,sizeof(shminfo));
shminfo.shmmni = shminfo.shmseg = shm_ctlmni;
shminfo.shmmax = shm_ctlmax;
shminfo.shmall = shm_ctlall;


shminfo.shmmin = SHMMIN;
if(copy_shminfo_to_user (buf, &shminfo, version))
return -EFAULT;
/* reading a integer is always atomic */
err= shm_ids.max_id;
if(err<0)
err = 0;
goto out;
}
/* SHM_INFO: report current usage statistics, sampled under shm_ids.sem */
case SHM_INFO:
{
struct shm_info shm_info;


err = security_shm_shmctl(NULL, cmd);
if (err)
return err;


memset(&shm_info,0,sizeof(shm_info));
down(&shm_ids.sem);
shm_info.used_ids = shm_ids.in_use;
shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp);
shm_info.shm_tot = shm_tot;
shm_info.swap_attempts = 0;
shm_info.swap_successes = 0;
err = shm_ids.max_id;
up(&shm_ids.sem);
if(copy_to_user (buf, &shm_info, sizeof(shm_info))) {
err = -EFAULT;
goto out;
}


err = err < 0 ? 0 : err;
goto out;
}
/*
 * SHM_STAT / IPC_STAT: copy the segment's descriptor to user space.
 * SHM_STAT treats shmid as a table index and returns the full
 * identifier; IPC_STAT validates shmid as an identifier and returns 0.
 */
case SHM_STAT:
case IPC_STAT:
{
struct shmid64_ds tbuf;
int result;
memset(&tbuf, 0, sizeof(tbuf));
shp = shm_lock(shmid);
if(shp==NULL) {
err = -EINVAL;
goto out;
} else if(cmd==SHM_STAT) {
err = -EINVAL;
if (shmid > shm_ids.max_id)
goto out_unlock;
result = shm_buildid(shmid, shp->shm_perm.seq);
} else {
err = shm_checkid(shp,shmid);
if(err)
goto out_unlock;
result = 0;
}
err=-EACCES;
if (ipcperms (&shp->shm_perm, S_IRUGO))
goto out_unlock;
err = security_shm_shmctl(shp, cmd);
if (err)
goto out_unlock;
kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
tbuf.shm_segsz= shp->shm_segsz;
tbuf.shm_atime= shp->shm_atim;
tbuf.shm_dtime= shp->shm_dtim;
tbuf.shm_ctime= shp->shm_ctim;
tbuf.shm_cpid = shp->shm_cprid;
tbuf.shm_lpid = shp->shm_lprid;
/* for hugepage-backed segments the attach count is derived from the file's reference count */
if (!is_file_hugepages(shp->shm_file))
tbuf.shm_nattch= shp->shm_nattch;
else
tbuf.shm_nattch = file_count(shp->shm_file) - 1;
/* the copy to user space happens after the segment is unlocked, from the local snapshot */
shm_unlock(shp);
if(copy_shmid_to_user (buf, &tbuf, version))
err = -EFAULT;
else
err = result;
goto out;
}
/* SHM_LOCK / SHM_UNLOCK: lock or unlock the segment's pages via shmem_lock() */
case SHM_LOCK:
case SHM_UNLOCK:
{
shp = shm_lock(shmid);
if(shp==NULL) {
err = -EINVAL;
goto out;
}
err = shm_checkid(shp,shmid);
if(err)
goto out_unlock;


/* without CAP_IPC_LOCK the caller must be owner or creator, and locking needs a non-zero RLIMIT_MEMLOCK */
if (!capable(CAP_IPC_LOCK)) {
err = -EPERM;
if (current->euid != shp->shm_perm.uid &&
   current->euid != shp->shm_perm.cuid)
goto out_unlock;
if (cmd == SHM_LOCK &&
   !current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
goto out_unlock;
}


err = security_shm_shmctl(shp, cmd);
if (err)
goto out_unlock;

if(cmd==SHM_LOCK) {
struct user_struct * user = current->user;
if (!is_file_hugepages(shp->shm_file)) {
err = shmem_lock(shp->shm_file, 1, user);
if (!err) {
shp->shm_flags |= SHM_LOCKED;
shp->mlock_user = user;
}
}
} else if (!is_file_hugepages(shp->shm_file)) {
shmem_lock(shp->shm_file, 0, shp->mlock_user);
shp->shm_flags &= ~SHM_LOCKED;
shp->mlock_user = NULL;
}
shm_unlock(shp);
goto out;
}
case IPC_RMID:
{
/*
* We cannot simply remove the file. The SVID states
* that the block remains until the last person
* detaches from it, then is deleted. A shmat() on
* an RMID segment is legal in older Linux and if 
* we change it apps break...
*
* Instead we set a destroyed flag, and then blow
* the name away when the usage hits zero.
*/
down(&shm_ids.sem);
shp = shm_lock(shmid);
err = -EINVAL;
if (shp == NULL) 
goto out_up;
err = shm_checkid(shp, shmid);
if(err)
goto out_unlock_up;


/* only the owner, the creator or a CAP_SYS_ADMIN holder may remove */
if (current->euid != shp->shm_perm.uid &&
   current->euid != shp->shm_perm.cuid && 
   !capable(CAP_SYS_ADMIN)) {
err=-EPERM;
goto out_unlock_up;
}


err = security_shm_shmctl(shp, cmd);
if (err)
goto out_unlock_up;


/* still attached: mark for destruction and hide the key; otherwise destroy now */
if (shp->shm_nattch){
shp->shm_flags |= SHM_DEST;
/* Do not find it any more */
shp->shm_perm.key = IPC_PRIVATE;
shm_unlock(shp);
} else
shm_destroy (shp);
up(&shm_ids.sem);
goto out;
}


/* IPC_SET: update owner uid/gid and permission bits from the user-supplied descriptor */
case IPC_SET:
{
if (copy_shmid_from_user (&setbuf, buf, version)) {
err = -EFAULT;
goto out;
}
if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode)))
return err;
down(&shm_ids.sem);
shp = shm_lock(shmid);
err=-EINVAL;
if(shp==NULL)
goto out_up;
err = shm_checkid(shp,shmid);
if(err)
goto out_unlock_up;
err=-EPERM;
if (current->euid != shp->shm_perm.uid &&
   current->euid != shp->shm_perm.cuid && 
   !capable(CAP_SYS_ADMIN)) {
goto out_unlock_up;
}


err = security_shm_shmctl(shp, cmd);
if (err)
goto out_unlock_up;

shp->shm_perm.uid = setbuf.uid;
shp->shm_perm.gid = setbuf.gid;
shp->shm_flags = (shp->shm_flags & ~S_IRWXUGO)
| (setbuf.mode & S_IRWXUGO);
shp->shm_ctim = get_seconds();
break;
}


default:
err = -EINVAL;
goto out;
}


/* reached only via the break in IPC_SET: success, then fall through the unlock path */
err = 0;
out_unlock_up:
shm_unlock(shp);
out_up:
up(&shm_ids.sem);
goto out;
out_unlock:
shm_unlock(shp);
out:
return err;
}

shmctl函数根据传入的cmd值的不同，进行不同的操作。例如SHM_STAT/IPC_STAT拷贝标识符为shmid的共享内存信息或者IPC信息到缓冲区buf。

先判断标识符为shmid的共享内存是否存在，然后在cmd为SHM_STAT时调用shm_buildid函数重新得到该共享内存标识符作为返回值（IPC_STAT则校验shmid并返回0）。接着判断读取权限，将共享内存信息拷贝到临时缓冲区tbuf，调用copy_shmid_to_user将tbuf拷贝到buf。

linux kernel源码剖析 共享内存部分 IPC 虚拟内存映射 VMA shmget shmat shmdt shmctl

猜你喜欢

linux kernel源码剖析共享内存部分 IPC 虚拟内存映射 VMA shmget shmat shmdt shmctl