Compiling and running TensorFlow Serving, and fixing an NCCL error

The exact versions matter a lot for this build. If a package download errors out during the build, just rerun the build a few more times.

Install Bazel (0.16.1):

bash bazel-0.16.1-installer-linux-x86_64.sh
export PATH="$PATH:$HOME/bin"
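
To confirm that this exact Bazel release is the one picked up from $HOME/bin, check the reported version; it should show a build label of 0.16.1:

bazel version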

Install TensorFlow Serving:
Download the TensorFlow Serving 1.12.0 source archive (1.12.0.zip).
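
A minimal sketch of fetching and unpacking the source, assuming the 1.12.0 archive is taken from the GitHub tag of tensorflow/serving (the unpacked directory name may differ slightly):

wget https://github.com/tensorflow/serving/archive/1.12.0.zip
unzip 1.12.0.zip
cd serving-1.12.0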

In WORKSPACE, add the following line before the first http_archive( line:
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
# Build the CPU version:
bazel build tensorflow_serving/model_servers:tensorflow_model_server
# Build the GPU version:
export TF_NEED_CUDA=1
export TF_CUDA_VERSION=10.0
export TF_CUDNN_VERSION=7

bazel build --config=cuda --copt="-fPIC" tensorflow_serving/model_servers:tensorflow_model_server

If the build fails with an NCCL error:
export NCCL_HDR_PATH=/usr/local/nccl-2.4/include
export NCCL_INSTALL_PATH=/usr/local/nccl-2.4/lib
export TF_NCCL_VERSION=2.4.2
export LD_LIBRARY_PATH=/usr/local/nccl-2.4/lib:$LD_LIBRARY_PATH

Run the server:

bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server --port=9000 --rest_api_port=9001 --model_name=test --model_base_path=/mnt/ad_relevance/reimgdata/
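
TensorFlow Serving expects model_base_path to contain one or more numeric version subdirectories, each holding an exported SavedModel. A sketch of the layout assumed here (version 1 is only an example):

/mnt/ad_relevance/reimgdata/
    1/
        saved_model.pb
        variables/
            variables.data-00000-of-00001
            variables.index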

Test:

curl http://localhost:9001/v1/models/test  # "test" is the model name
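
If the model has loaded successfully, the status endpoint returns JSON roughly like the following (exact fields can vary with the TensorFlow Serving version):

{
 "model_version_status": [
  {
   "version": "1",
   "state": "AVAILABLE",
   "status": { "error_code": "OK", "error_message": "" }
  }
 ]
}

The Python script below is a simple gRPC load-test client that sends batched Predict requests to the same server: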
import sys
import cv2
import time
import numpy as np
import tensorflow as tf
from grpc.beta import implementations

from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2

num_per_request = 300      # images packed into each Predict request
size = 299                 # height/width after resize (Inception-style input)
central_fraction = 0.875   # fraction kept by the central crop
HOST = "10.22.151.155"
PORT = 9000
channel = implementations.insecure_channel(HOST, PORT)   # legacy grpc.beta channel used by TF Serving 1.x clients
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

def preprocess(img):
    # Central-crop by `central_fraction`, resize to size x size, convert BGR (OpenCV) to RGB
    print(img.shape)
    # img = img.astype(np.float32)
    h, w, c = img.shape
    # img = img * 1. / 255.

    bbox_start_h = int( (h - h * central_fraction) / 2 )
    bbox_start_w = int( (w - w * central_fraction) / 2 )
    bbox_end_h = bbox_start_h + h - bbox_start_h * 2
    bbox_end_w = bbox_start_w + w - bbox_start_w * 2
    img = img[bbox_start_h:bbox_end_h, bbox_start_w:bbox_end_w, :]
    print(img.shape)
    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_LINEAR)
    img = img.astype(np.uint8)
    # img = (img - 0.5) * 2
    print(img.shape)
    # if use_fp:
    #     img = np.array(img).astype(np.float16)
    return img[:,:,::-1]

def get(name, input_ids=0, input_mask=0, segment_ids=0, pos_input_ids=0, pos_input_mask=0,
        pos_segment_ids=0, pos_image=0.0 ):
    # Build a PredictRequest for the given signature name and send it over gRPC
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'test'
    request.model_spec.signature_name = name
    # request.inputs['pos_input_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(pos_input_ids))
    # request.inputs['pos_input_mask'].CopyFrom(tf.contrib.util.make_tensor_proto(pos_input_mask))
    # request.inputs['pos_segment_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(pos_segment_ids))
    if name=="serving_default":
        request.inputs['input_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(input_ids))
        request.inputs['input_mask'].CopyFrom(tf.contrib.util.make_tensor_proto(input_mask))
        request.inputs['segment_ids'].CopyFrom(tf.contrib.util.make_tensor_proto(segment_ids))
    else:
        request.inputs['image_list'].CopyFrom(tf.contrib.util.make_tensor_proto(pos_image))
    result = stub.Predict(request, 1200)  # 1200 s timeout
    #print(result.outputs["image_emb"].float_val)
    return result

def loop_request(func, **args):
    # Send `total` requests sequentially and report mean latency and error rate
    global total
    start = time.time()
    error = 0
    for i in range(total):
        print(i)
        try:
            func(**args)
        except Exception as e:
            print(e)
            error += 1
            continue
    end = time.time()
    print("time_rate: ", (end-start)/total, "error_rate: ", error/total)

def test_cnn():
    #pos_image = np.full((1,299,299,3), 128).astype(np.float32)
    pos_image = cv2.imread("/data00/home/huangqingkang/imgs/1aacb000b0d93c755f995")
    pos_image = preprocess(pos_image)
    pos_image = np.expand_dims(pos_image, 0)
    pos_image = np.tile(pos_image, [num_per_request, 1, 1, 1])
            
    loop_request(get, name="serving_cnn", pos_image=pos_image)

def test_bert():
    # pos_input_ids = np.full((num_per_request,32), 0).astype(np.int32)
    # pos_input_mask = np.full((num_per_request,32), 0).astype(np.int32)
    # pos_segment_ids = np.full((num_per_request,32), 0).astype(np.int32)
    input_ids = np.full((num_per_request+1,32), 0).astype(np.int32)
    input_mask = np.full((num_per_request+1,32), 0).astype(np.int32)
    segment_ids = np.full((num_per_request+1,32), 0).astype(np.int32)
    pos_image = np.array([0], dtype=np.uint8)
    # if use_fp:
    #     pos_image = np.array(pos_image).astype(np.float16)

    loop_request(get, name="serving_default", input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, 
            pos_image = pos_image)
    # pos_input_ids = pos_input_ids, pos_input_mask = pos_input_mask, pos_segment_ids = pos_segment_ids,

def get_cnn_emb(pos_image):
    result = get(name="serving_cnn", pos_image=pos_image)
    return result.outputs["image_emb"].float_val

# def get_bert_emb(**args):



if __name__ == "__main__":
    # Usage: <cnn|bert> <total_requests> <num_per_request> <use_fp16>
    total = int(sys.argv[2])
    if int(sys.argv[3]):
        num_per_request = int(sys.argv[3])
    use_fp = int(sys.argv[4])  # only used by the commented-out fp16 paths above
    if sys.argv[1] == "cnn":
        test_cnn()
    elif sys.argv[1] == "bert":
        test_bert()
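
Assuming the script is saved as client_test.py (the file name here is only illustrative), a typical run sends 100 requests of 300 images each against the CNN signature:

python client_test.py cnn 100 300 0   # <cnn|bert> <total_requests> <num_per_request> <use_fp16 flag>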


Source: blog.csdn.net/qq_16234613/article/details/97900949