Preface
This article walks through the read/write logic and implementation of the cache layer in Ceph's cache tier mechanism from the perspective of the Ceph source code. The source environment is as follows:
Ceph version: 14.2.22
PrimaryLogPG::do_request
File path: ceph/src/osd/PrimaryLogPG.cc
When an OSD receives a request from a client, it calls do_request, a method of the PrimaryLogPG class. Inside this function the OSD takes different actions depending on the type of the client request; in the normal case it calls do_op to process the request.
void PrimaryLogPG::do_request(OpRequestRef &op, ThreadPool::TPHandle &handle)
{
if (op->osd_trace)
{
op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
op->pg_trace.event("do request");
}
// make sure we have a new enough map
//check whether this op has to wait for a newer osdmap
auto p = waiting_for_map.find(op->get_source());
//if this source already has ops queued waiting for a map, queue behind them to preserve ordering; otherwise the op can be checked directly
if (p != waiting_for_map.end())
{
// preserve ordering
dout(20) << __func__ << " waiting_for_map " << p->first << " not empty, queueing" << dendl;
p->second.push_back(op);
op->mark_delayed("waiting_for_map not empty");
return;
}
if (!have_same_or_newer_map(op->min_epoch))
{
dout(20) << __func__ << " min " << op->min_epoch << ", queue on waiting_for_map " << op->get_source() << dendl;
waiting_for_map[op->get_source()].push_back(op);
op->mark_delayed("op must wait for map");
osd->request_osdmap_update(op->min_epoch);
return;
}
//check whether this op can be discarded; if so, return immediately
if (can_discard_request(op))
{
return;
}
// pg-wide backoffs
const Message *m = op->get_req();//get the Message pointer for this request
int msg_type = m->get_type();//get the message type
if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF))
{
SessionRef session{
static_cast<Session *>(m->get_connection()->get_priv().get())};
if (!session)
return; // drop it.
if (msg_type == CEPH_MSG_OSD_OP)
{
if (session->check_backoff(cct, info.pgid, info.pgid.pgid.get_hobj_start(), m))
{
return;
}
bool backoff = is_down() || is_incomplete() || (!is_active() && is_peered());
if (g_conf()->osd_backoff_on_peering && !backoff)
{
if (is_peering())
{
backoff = true;
}
}
if (backoff)
{
add_pg_backoff(session);
return;
}
}
// pg backoff acks at pg-level
if (msg_type == CEPH_MSG_OSD_BACKOFF)
{
const MOSDBackoff *ba = static_cast<const MOSDBackoff *>(m);
if (ba->begin != ba->end)
{
handle_backoff(op);
return;
}
}
}
//check the PG state; if the PG has not finished peering yet, the op must wait
if (!is_peered())
{
// Delay unless PGBackend says it's ok
//check whether the PG backend can handle the op while inactive; if so, hand it to pgbackend->handle_message
if (pgbackend->can_handle_while_inactive(op))
{
bool handled = pgbackend->handle_message(op);
ceph_assert(handled);
return;
}
else
{
//otherwise, park the op on the waiting_for_peered queue
waiting_for_peered.push_back(op);
op->mark_delayed("waiting for peered");
return;
}
}
//if a flush is in progress, park the op on the waiting_for_flush queue
if (flushes_in_progress > 0)
{
dout(20) << flushes_in_progress << " flushes_in_progress pending "
<< "waiting for flush on " << op << dendl;
waiting_for_flush.push_back(op);
op->mark_delayed("waiting for flush");
return;
}
ceph_assert(is_peered() && flushes_in_progress == 0);
//if the PG backend handles the op itself, return directly
if (pgbackend->handle_message(op))
return;
//otherwise dispatch according to the message type
switch (msg_type)
{
case CEPH_MSG_OSD_OP:
case CEPH_MSG_OSD_BACKOFF:
if (!is_active())
{
//if the PG is peered but not yet active, park the op on the waiting_for_active queue
dout(20) << " peered, not active, waiting for active on " << op << dendl;
waiting_for_active.push_back(op);
op->mark_delayed("waiting for active");
return;
}
//the PG is active and can process the op
switch (msg_type)
{
case CEPH_MSG_OSD_OP:
// verify client features
if ((pool.info.has_tiers() || pool.info.is_tier()) && !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL))
{
//the pool is tiered but the client lacks CEPH_FEATURE_OSD_CACHEPOOL, so reject the op
osd->reply_op_error(op, -EOPNOTSUPP);
return;
}
do_op(op);//process the op
break;
case CEPH_MSG_OSD_BACKOFF:
// object-level backoff acks handled in osdop context
handle_backoff(op);
break;
}
break;
case MSG_OSD_PG_SCAN:
do_scan(op, handle);
break;
case MSG_OSD_PG_BACKFILL:
do_backfill(op);
break;
case MSG_OSD_PG_BACKFILL_REMOVE:
do_backfill_remove(op);
break;
case MSG_OSD_SCRUB_RESERVE: {
const MOSDScrubReserve *m = static_cast<const MOSDScrubReserve *>(op->get_req());
switch (m->type)
{
case MOSDScrubReserve::REQUEST:
handle_scrub_reserve_request(op);
break;
case MOSDScrubReserve::GRANT:
handle_scrub_reserve_grant(op, m->from);
break;
case MOSDScrubReserve::REJECT:
handle_scrub_reserve_reject(op, m->from);
break;
case MOSDScrubReserve::RELEASE:
handle_scrub_reserve_release(op);
break;
}
}
break;
case MSG_OSD_REP_SCRUB:
replica_scrub(op, handle);
break;
case MSG_OSD_REP_SCRUBMAP:
do_replica_scrub_map(op);
break;
case MSG_OSD_PG_UPDATE_LOG_MISSING:
do_update_log_missing(op);
break;
case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
do_update_log_missing_reply(op);
break;
default:
ceph_abort_msg("bad message type in do_request");
}
}
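One detail worth calling out in do_request is the waiting_for_map check at the top: once any op from a given source is queued waiting for a newer osdmap, every later op from that same source must queue behind it, even if its own min_epoch is already satisfied, otherwise ops from one client could be reordered. Below is a minimal, self-contained sketch of this pattern (hypothetical types and names, not Ceph code):
#include <deque>
#include <map>
#include <string>

// Hypothetical illustration of the waiting_for_map pattern in do_request:
// ops are keyed by their source; if the source already has queued ops,
// new ops must queue behind them to preserve per-client ordering.
struct Op { int min_epoch; };

struct MapGate {
  int cur_epoch = 0;
  std::map<std::string, std::deque<Op>> waiting_for_map;

  // Returns true if the op may run now, false if it was queued.
  bool admit(const std::string &source, const Op &op) {
    auto p = waiting_for_map.find(source);
    if (p != waiting_for_map.end()) {
      p->second.push_back(op);                // preserve ordering behind older ops
      return false;
    }
    if (op.min_epoch > cur_epoch) {
      waiting_for_map[source].push_back(op);  // wait for a newer map
      return false;
    }
    return true;                              // map is new enough, run immediately
  }
};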
PrimaryLogPG::do_op
File path: ceph/src/osd/PrimaryLogPG.cc
do_op is a method of the PrimaryLogPG class and is the general function through which the OSD processes client requests. For cache tiering, the relevant work happens in its calls to agent_choose_mode and maybe_handle_cache: agent_choose_mode computes the cache pool's flush mode and evict mode and triggers the cache pool's flush and evict operations, while maybe_handle_cache handles proxied reads/writes and decides whether the data being read or written should be promoted into the cache pool.
void PrimaryLogPG::do_op(OpRequestRef &op)
{
...
...
//hit set bookkeeping
bool in_hit_set = false;
if (hit_set)
{
if (obc.get())
{
if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
in_hit_set = true;
}
else
{
if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
in_hit_set = true;
}
if (!op->hitset_inserted)
{
hit_set->insert(oid);
op->hitset_inserted = true;
if (hit_set->is_full() || hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp())
{
hit_set_persist();
}
}
}
//cache tier agent handling
if (agent_state)
{
if (agent_choose_mode(false, op))
return;
}
if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest())
{
if (maybe_handle_manifest(op, write_ordered, obc))
return;
}
//cache tier handling: if the object hits in the cache pool, nothing more is needed
//on a miss, the data is read or written according to the cache tier policy
if (maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false, in_hit_set))
return;
//error handling after the cache tier path
if (r && (r != -ENOENT || !obc))
{
// copy the reqids for copy get on ENOENT
if (r == -ENOENT && (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET))
{
fill_in_copy_get_noent(op, oid, m->ops[0]);
return;
}
dout(20) << __func__ << ": find_object_context got error " << r << dendl;
if (op->may_write() && get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN)
{
record_write_error(op, oid, nullptr, r);
}
else
{
osd->reply_op_error(op, r);
}
return;
}
...
...
}
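agent_choose_mode itself is a long function; at its core it turns the cache pool's dirty ratio and full ratio into a flush mode and an evict mode. The following is a condensed, standalone sketch of that decision, assuming the ratios have already been computed in micro-units (1000000 = 100%); the thresholds correspond to the pool options cache_target_dirty_ratio, cache_target_dirty_high_ratio and cache_target_full_ratio, while the real function additionally derives the ratios from PG stats and hit-set state and computes a flush/evict "effort":
#include <cstdint>
#include <string>

// Hypothetical, standalone illustration of the core decision in
// agent_choose_mode; not a verbatim listing.
struct TierModes {
  std::string flush_mode; // IDLE, LOW_SPEED or HIGH_SPEED
  std::string evict_mode; // IDLE, SOME or FULL
};

TierModes choose_modes(uint64_t dirty_micro,             // dirty-object ratio
                       uint64_t full_micro,              // cache fullness ratio
                       uint64_t dirty_target_micro,      // cache_target_dirty_ratio, e.g. 400000
                       uint64_t dirty_high_target_micro, // cache_target_dirty_high_ratio, e.g. 600000
                       uint64_t full_target_micro)       // cache_target_full_ratio, e.g. 800000
{
  TierModes m{"IDLE", "IDLE"};
  if (dirty_micro > dirty_high_target_micro)
    m.flush_mode = "HIGH_SPEED";   // flush aggressively
  else if (dirty_micro > dirty_target_micro)
    m.flush_mode = "LOW_SPEED";    // flush in the background
  if (full_micro > 1000000)
    m.evict_mode = "FULL";         // effectively full: evict anything clean;
                                   // writes start to block (see maybe_handle_cache_detail)
  else if (full_micro > full_target_micro)
    m.evict_mode = "SOME";         // evict cold objects proportionally
  return m;
}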
PrimaryLogPG::maybe_handle_cache
File path: ceph/src/osd/PrimaryLogPG.cc
maybe_handle_cache is a method of the PrimaryLogPG class. It is not the function that actually services cached reads and writes; it is only a thin wrapper that delegates the concrete work to maybe_handle_cache_detail.
bool maybe_handle_cache(OpRequestRef op, bool write_ordered, ObjectContextRef obc, int r,
const hobject_t &missing_oid, bool must_promote, bool in_hit_set = false)
{
// a return value of cache_result_t::NOOP means the cache tier took no action at all
return cache_result_t::NOOP !=
maybe_handle_cache_detail(op, write_ordered, obc, r, missing_oid, must_promote, in_hit_set, nullptr);
}
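For reference, the result codes that maybe_handle_cache_detail can return are declared in ceph/src/osd/PrimaryLogPG.h; in the 14.2.x sources the enum looks roughly like this (the comments are added here and are not in the source):
enum class cache_result_t {
  NOOP,                // cache tier took no action; do_op continues normally
  BLOCKED_FULL,        // cache pool full; op queued until flush/evict frees space
  BLOCKED_PROMOTE,     // op queued until the object is promoted into the cache
  HANDLED_PROXY,       // op was proxied to the base pool
  HANDLED_REDIRECT,    // client was told to resend the op to the base pool
  REPLIED_WITH_EAGAIN, // cache miss on a non-primary; client retries at the primary
  BLOCKED_RECOVERY,    // op queued behind recovery of the object
};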
PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail
File path: ceph/src/osd/PrimaryLogPG.cc
maybe_handle_cache_detail is a method of the PrimaryLogPG class and is the real entry point for cache tier handling. It first checks whether the object to be read or written already exists locally on this OSD; if it does, no cache tier action is needed at all. If it does not, the request is proxied according to the cache tier policy, and the promotion policy then decides whether the data should also be promoted into the cache pool.
Note: while debugging this code I found that the write path for new objects in writeback mode does not match the official documentation. The docs describe writeback mode as writing data into the cache pool first and then flushing it from the cache pool down to the base pool. Debugging and reading the source show the opposite for this path: the data is first proxy-written to the base pool and only afterwards promoted from the base pool into the cache pool (see the do_proxy_write/maybe_promote branch below).
PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(OpRequestRef op, bool write_ordered,
ObjectContextRef obc, int r, hobject_t missing_oid,
bool must_promote, bool in_hit_set,
ObjectContextRef *promote_obc)
{
//if the client set CEPH_OSD_FLAG_IGNORE_CACHE, bypass the cache entirely
if (op && op->get_req() && op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
(static_cast<const MOSDOp *>(op->get_req())->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE))
{
dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
return cache_result_t::NOOP;
}
//getting here means the request is allowed to use the cache; next, check whether tiering is actually configured
//note: `pool` may be either the base pool or the cache pool, since this OSD could belong to either
// return quickly if caching is not enabled
if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
return cache_result_t::NOOP;
// promote flag
must_promote = must_promote || op->need_promote();
if (obc)
dout(25) << __func__ << " " << obc->obs.oi << " " << (obc->obs.exists ? "exists" : "DNE") << " missing_oid "
<< missing_oid << " must_promote " << (int)must_promote << " in_hit_set " << (int)in_hit_set << dendl;
else
dout(25) << __func__ << " (no obc)"
<< " missing_oid " << missing_oid << " must_promote " << (int)must_promote << " in_hit_set "
<< (int)in_hit_set << dendl;
// from here on the pool must be a cache pool, since only cache pools have a cache_mode set
// check via the object context whether the object is currently blocked by an in-flight operation
// if it is write-ordered and blocked, stop now
if (obc.get() && obc->is_blocked() && write_ordered)
{
// we're already doing something with this object
dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
return cache_result_t::NOOP;
}
//if the object is logically absent (for example an undefined clone), the cache tier cannot do anything with it
if (r == -ENOENT && missing_oid == hobject_t())
{
// we know this object is logically absent (e.g., an undefined clone)
return cache_result_t::NOOP;
}
//check whether the object already exists locally on this OSD, i.e. whether it hits in the cache pool
//on a cache hit, no cache tier handling is needed
if (obc.get() && obc->obs.exists)
{
osd->logger->inc(l_osd_op_cache_hit);
return cache_result_t::NOOP;
}
//on a cache miss on a non-primary OSD we cannot promote,
//so reply -EAGAIN and let the client retry against the primary
if (!is_primary())
{
dout(20) << __func__ << " cache miss; ask the primary" << dendl;
osd->reply_op_error(op, -EAGAIN);//return -EAGAIN to the client
return cache_result_t::REPLIED_WITH_EAGAIN;
}
//we can get here with an empty missing_oid but a valid object context;
//take the oid from the context so the promote path below
//knows which object to fetch
if (missing_oid == hobject_t() && obc.get())
{
missing_oid = obc->obs.oi.soid;
}
//getting here means the object really does not exist in this OSD (the cache pool)
const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
//get the object locator (where the object lives in the cluster)
const object_locator_t oloc = m->get_object_locator();
//if the op asks to skip cache handling, do nothing
if (op->need_skip_handle_cache())
{
return cache_result_t::NOOP;
}
OpRequestRef promote_op;
//below is the cache tier logic proper: the request is serviced according to the configured cache mode
switch (pool.info.cache_mode)
{
//writeback mode
case pg_pool_t::CACHEMODE_WRITEBACK:
//if the cache pool is in the full state, only reads can be serviced
if (agent_state && agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL)
{
//if the op is neither a write nor a cache op, is not write-ordered, and need not be promoted, it is a plain read
//reads can be proxied straight through to the base pool
if (!op->may_write() && !op->may_cache() && !write_ordered && !must_promote)
{
dout(20) << __func__ << " cache pool full, proxying read" << dendl;
do_proxy_read(op);//proxy the read
return cache_result_t::HANDLED_PROXY;
}
//otherwise this is a write (or write-ordered op); it must wait until the cache pool is no longer full
dout(20) << __func__ << " cache pool full, waiting" << dendl;
block_write_on_full_cache(missing_oid, op);
return cache_result_t::BLOCKED_FULL;
}
//the cache pool is not full, so both reads and writes are possible
//if promotion is mandatory, or no hit_set is configured (so there is no recency
//information to consult) and the op does not ask to skip promotion,
//promote the object into the cache pool and block the op until the promote completes
if (must_promote || (!hit_set && !op->need_skip_promote()))
{
promote_object(obc, missing_oid, oloc, op, promote_obc);
return cache_result_t::BLOCKED_PROMOTE;
}
//no unconditional promote was issued
//handle write requests
if (op->may_write() || op->may_cache())
{
// proxy the write to the base pool
do_proxy_write(op);
// Promote too?
//decide whether the written object should also be promoted into the cache pool
if (!op->need_skip_promote() &&
maybe_promote(obc, missing_oid, oloc, in_hit_set, pool.info.min_write_recency_for_promote,
OpRequestRef(), promote_obc))
{
return cache_result_t::BLOCKED_PROMOTE;
}
return cache_result_t::HANDLED_PROXY;
}
// handle read requests
else
{
//proxy the read to the base pool
do_proxy_read(op);
// Avoid duplicate promotion
if (obc.get() && obc->is_blocked())
{
if (promote_obc)
*promote_obc = obc;
return cache_result_t::BLOCKED_PROMOTE;
}
// Promote too?
if (!op->need_skip_promote())
{
(void)maybe_promote(obc, missing_oid, oloc, in_hit_set, pool.info.min_read_recency_for_promote,
promote_op, promote_obc);
}
return cache_result_t::HANDLED_PROXY;
}
//not reached: every branch above returns
ceph_abort_msg("unreachable");
return cache_result_t::NOOP;
//forward mode
case pg_pool_t::CACHEMODE_FORWARD:
// FIXME: this mode allows requests to be reordered.
do_cache_redirect(op);
return cache_result_t::HANDLED_REDIRECT;
//readonly mode
case pg_pool_t::CACHEMODE_READONLY:
// TODO: clean this case up
if (!obc.get() && r == -ENOENT)
{
// we don't have the object and op's a read
promote_object(obc, missing_oid, oloc, op, promote_obc);
return cache_result_t::BLOCKED_PROMOTE;
}
if (!r)
{
// it must be a write
do_cache_redirect(op);
return cache_result_t::HANDLED_REDIRECT;
}
// crap, there was a failure of some kind
return cache_result_t::NOOP;
//readforward mode
case pg_pool_t::CACHEMODE_READFORWARD:
// Do writeback to the cache tier for writes
if (op->may_write() || write_ordered || must_promote)
{
if (agent_state && agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL)
{
dout(20) << __func__ << " cache pool full, waiting" << dendl;
block_write_on_full_cache(missing_oid, op);
return cache_result_t::BLOCKED_FULL;
}
promote_object(obc, missing_oid, oloc, op, promote_obc);
return cache_result_t::BLOCKED_PROMOTE;
}
// If it is a read, we can read, we need to forward it
do_cache_redirect(op);
return cache_result_t::HANDLED_REDIRECT;
//proxy mode
case pg_pool_t::CACHEMODE_PROXY:
if (!must_promote)
{
if (op->may_write() || op->may_cache() || write_ordered)
{
do_proxy_write(op);
return cache_result_t::HANDLED_PROXY;
}
else
{
do_proxy_read(op);
return cache_result_t::HANDLED_PROXY;
}
}
// ugh, we're forced to promote.
if (agent_state && agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL)
{
dout(20) << __func__ << " cache pool full, waiting" << dendl;
block_write_on_full_cache(missing_oid, op);
return cache_result_t::BLOCKED_FULL;
}
promote_object(obc, missing_oid, oloc, op, promote_obc);
return cache_result_t::BLOCKED_PROMOTE;
//readproxy mode
case pg_pool_t::CACHEMODE_READPROXY:
// Do writeback to the cache tier for writes
if (op->may_write() || write_ordered || must_promote)
{
if (agent_state && agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL)
{
dout(20) << __func__ << " cache pool full, waiting" << dendl;
block_write_on_full_cache(missing_oid, op);
return cache_result_t::BLOCKED_FULL;
}
promote_object(obc, missing_oid, oloc, op, promote_obc);
return cache_result_t::BLOCKED_PROMOTE;
}
// If it is a read, we can read, we need to proxy it
do_proxy_read(op);
return cache_result_t::HANDLED_PROXY;
default:
ceph_abort_msg("unrecognized cache_mode");
}
// not reached: every recognized cache mode returns or aborts above
return cache_result_t::NOOP;
}
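Both the read branch and the write branch of the writeback case gate promotion through maybe_promote, which applies the pool's min_read_recency_for_promote / min_write_recency_for_promote setting against the hit sets. Below is a condensed, standalone sketch of that recency check (paraphrased from the 14.2.x source; the promote throttle and the real HitSet types are omitted):
#include <cstdint>
#include <vector>

// Standalone sketch of the recency check: hit_sets[i] == true means the
// object appears in the i-th most recent hit set (index 0 is the current,
// in-memory one). `recency` is the pool's min_*_recency_for_promote value.
bool should_promote(uint32_t recency, const std::vector<bool> &hit_sets)
{
  if (recency == 0)
    return true;                 // promote unconditionally on a miss
  uint32_t count = 0;
  for (bool in : hit_sets) {
    if (!in)
      break;                     // hits must be consecutive, newest first
    if (++count >= recency)
      return true;               // hot enough: promote
  }
  return false;                  // not hot enough: keep proxying
}
With recency 0 every miss is promoted; with recency 1 an object is promoted only if it is in the current hit set; with recency N it must appear in N consecutive recent hit sets, newest first.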
PrimaryLogPG::promote_object
File path: ceph/src/osd/PrimaryLogPG.cc
promote_object copies an object from the base pool into the cache pool: it builds a completion callback, starts an asynchronous copy with start_copy, and blocks the triggering op until the copy finishes.
void PrimaryLogPG::promote_object(ObjectContextRef obc, const hobject_t &missing_oid, const object_locator_t &oloc,
OpRequestRef op, ObjectContextRef *promote_obc)
{
hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
ceph_assert(hoid != hobject_t());
//Scrub is Ceph's data-integrity check: it verifies the data of the objects stored in a pool.
//While an object is being scrubbed, writes to it are blocked until the scrub finishes.
//Check whether the object is blocked by scrub; if so, queue the op and handle it once the scrub completes.
if (write_blocked_by_scrub(hoid))
{
dout(10) << __func__ << " " << hoid << " blocked by scrub" << dendl;
if (op)
{
waiting_for_scrub.push_back(op);//park the op until the scrub finishes
op->mark_delayed("waiting for scrub");
dout(10) << __func__ << " " << hoid << " placing op in waiting_for_scrub" << dendl;
}
else
{
dout(10) << __func__ << " " << hoid << " no op, dropping on the floor" << dendl;
}
return;
}
//create an object context; this handles objects that do not yet exist in the cache pool
if (!obc)
{
// we need to create an ObjectContext
ceph_assert(missing_oid != hobject_t());
obc = get_object_context(missing_oid, true);
}
// hand the obc back to the caller if it asked for it
if (promote_obc)
*promote_obc = obc;
/*
* Before promote complete, if there are proxy-reads for the object,
* for this case we don't use DONTNEED.
*/
//choose the fadvise flags used when reading the source object
unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
if (q == in_progress_proxy_ops.end())
{
src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
}
CopyCallback *cb;
object_locator_t my_oloc;
hobject_t src_hoid;
//build the callback that runs once the promote (copy) completes;
//it is invoked when the copy started by start_copy finishes
//common case: the object has no manifest, i.e. it is a plain object whose data lives in the base pool
if (!obc->obs.oi.has_manifest())
{
my_oloc = oloc;
my_oloc.pool = pool.info.tier_of;//copy from the base pool (tier_of)
src_hoid = obc->obs.oi.soid;//source object id for the copy
cb = new PromoteCallback(obc, this);
}
else
{
//the object has a manifest: its data is chunked across, or redirected to, other objects
if (obc->obs.oi.manifest.is_chunked())
{
src_hoid = obc->obs.oi.soid;
cb = new PromoteManifestCallback(obc, this);
}
else if (obc->obs.oi.manifest.is_redirect())
{
object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
my_oloc = src_oloc;
src_hoid = obc->obs.oi.manifest.redirect_target;
cb = new PromoteCallback(obc, this);
}
else
{
ceph_abort_msg("unrecognized manifest type");
}
}
unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
//start the copy, i.e. the actual promote
start_copy(cb, obc, src_hoid, my_oloc, 0, flags, obc->obs.oi.soid.snap == CEPH_NOSNAP, src_fadvise_flags, 0);
//the object must now be blocked so no other op can touch it mid-promote
ceph_assert(obc->is_blocked());
//promote_object() copies data and updates the object's version, which changes the object's state;
//the obc is therefore marked blocked so that no other op can modify the object concurrently.
//wait_for_blocked_object() parks the triggering op until the promote unblocks the object.
if (op)
wait_for_blocked_object(obc->obs.oi.soid, op);
info.stats.stats.sum.num_promote++;
}
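For completeness, the PromoteCallback built above lives in the same file and, paraphrased (the real class also records promote latency via a perf counter), looks roughly like this: when the copy started by start_copy completes, finish() hands the copy results to finish_promote, which installs the promoted object in the cache pool and re-queues the ops that wait_for_blocked_object parked.
class PromoteCallback : public PrimaryLogPG::CopyCallback {
  ObjectContextRef obc;
  PrimaryLogPG *pg;
public:
  PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
    : obc(obc_), pg(pg_) {}
  void finish(PrimaryLogPG::CopyCallbackResults results) override {
    PrimaryLogPG::CopyResults *results_data = results.get<1>();
    int r = results.get<0>();
    // install the copied object locally and re-queue the ops that were
    // parked by wait_for_blocked_object()
    pg->finish_promote(r, results_data, obc);
  }
};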
Due to time constraints, this document is still being improved…