<!doctype html><html lang="zh-cmn-Hans-CN"><head><meta charset="utf-8"/><!--[if IE]><link rel="shortcut icon" href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/favicon.ico"><![endif]--><link rel="icon" href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/favicon-48.png" sizes="48x48"><link rel="icon" href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/favicon-64.png" sizes="64x64"><link rel="icon" href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/favicon-128.png" sizes="128x128"><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#fff"/><link rel="manifest" href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/manifest.json"/><title>飞桨PaddlePaddle-源于产业实践的开源深度学习平台</title><meta name="keywords" content="开源深度学习平台,PaddlePaddle Fluid,飞桨PaddlePaddle官网,飞桨PaddlePaddle教程,飞桨PaddlePaddle框架,飞桨PaddlePaddle使用,飞桨PaddlePaddle book,飞桨,百度飞桨,PaddlePaddle"/><meta name="description" content="飞桨致力于让深度学习技术的创新与应用更简单。具有以下特点:同时支持动态图和静态图,兼顾灵活性和效率;精选应用效果最佳算法模型并提供官方支持;真正源于产业实践,提供业界最强的超大规模并行深度学习能力;推理引擎一体化设计,提供训练到多端推理的无缝对接;唯一提供系统化技术服务与支持的深度learning平台"/>
<!-- Wraps history.pushState / history.replaceState so that each call also dispatches a window event named "pushstate" / "replacestate" (lower-cased method name) and invokes window.onpushstate / window.onreplacestate when those are functions. The guard relies on var hoisting: _historyPolyfill is undefined the first time this runs. -->
<script>if(void 0===_historyPolyfill){var _historyPolyfill=function(t){var n=history[t];function e(t){var n;return"function"==typeof Event?n=new Event(t):(n=document.createEvent("Event")).initEvent(t,!0,!0),n}return function(){var i=n.apply(this,arguments),o=t.toLowerCase(),r="on"+o,a=e(o);return a.arguments=arguments,window.dispatchEvent(a),"function"==typeof window[r]&&window[r].apply(this,arguments),i}};history.pushState=_historyPolyfill("pushState"),history.replaceState=_historyPolyfill("replaceState")}</script>
<!-- Baidu analytics loader: removes any existing #baidu-tj script, inserts a fresh hm.js script before the first script element, and resets _hmt to undefined on the next tick when _hmt.push is not a function. Runs once now and again on popstate / pushstate / replacestate. -->
<script>var _hmt=_hmt||{};!function(){var e=function(){_hmt=_hmt||{};var e=document.getElementById("baidu-tj");e&&e.parentNode.removeChild(e);var t=document.createElement("script");t.id="baidu-tj",t.src="https://hm.baidu.com/hm.js?89be97848720f62fa00a07b1e0d83ae6";var n=document.getElementsByTagName("script")[0];n.parentNode.insertBefore(t,n),setTimeout((function(){"function"!=typeof _hmt.push&&(_hmt=void 0)}),0)};e(),window.addEventListener("popstate",e),window.addEventListener("pushstate",e),window.addEventListener("replacestate",e)}()</script>
<!-- Baidu link-submit loader: replaces the #baidu-push script with one whose URL depends on the page protocol (https vs. anything else). Runs once now and again on popstate / pushstate / replacestate. -->
<script>!function(){var t=function(){var t=document.getElementById("baidu-push");t&&t.parentNode.removeChild(t);var e=document.createElement("script");e.id="baidu-push";var n=window.location.protocol.split(":")[0];e.src="https"===n?"https://zz.bdstatic.com/linksubmit/push.js":"http://push.zhanzhang.baidu.com/push.js";var a=document.getElementsByTagName("script")[0];a.parentNode.insertBefore(e,a)};t(),window.addEventListener("popstate",t),window.addEventListener("pushstate",t),window.addEventListener("replacestate",t)}()</script><style>.paddle-s-e-o{position:absolute;overflow:hidden;height:0;width:0}</style><link href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/css/0.6d26e789.chunk.css" rel="stylesheet"><link href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/css/1.5ef1cb33.chunk.css" rel="stylesheet"><link href="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/css/document.d8b47223.chunk.css" rel="stylesheet"></head><body class="wy-body-for-nav"><style>.paddle-logo{position:absolute;overflow:hidden;height:0;width:0}</style><!-- Zero-size logo image; empty alt keeps assistive technology from announcing it. --><div class="paddle-logo"><img src="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/media/paddlelogo.0b483fa7.png" alt=""/></div><noscript>You need to enable JavaScript to run this app.</noscript><!-- Zero-height element holding a left-to-right mark (U+200E); written as a character reference because a JS-style escape is plain text in HTML. --><div style="height:0;overflow:hidden">&lrm;</div><div id="root"></div><!-- Page bootstrap data: window.pageData carries the navigation tree and SEO strings; window.docInfo carries the documentation language ("zh") and version ("1.5"). --><script>(function(){window.pageData={"navData":[{"title":"开始使用","href":"/start","items":[]},{"title":"特性","href":"/feature","items":[]},{"title":"文档","href":"","items":[{"title":"API","href":"/documentation/docs/zh/1.5/api_cn/index_cn.html?from=paddlenav","items":[]},{"title":"使用指南","href":"/documentation/docs/zh/1.5/user_guides/index_cn.html?from=paddlenav","items":[]}]},{"title":"工具平台","href":"","items":[{"title":"工具","href":"","items":[{"title":"AutoDL","href":"https://github.com/PaddlePaddle/AutoDL/tree/master/AutoDL%20Design"}]}]}],"seo":{"title":"飞桨PaddlePaddle-源于产业实践的开源深度学习平台","keywords":"开源深度学习平台,PaddlePaddle Fluid,飞桨PaddlePaddle官网,飞桨PaddlePaddle教程,飞桨PaddlePaddle框架,飞桨PaddlePaddle使用,飞桨PaddlePaddle book,飞桨,百度飞桨,PaddlePaddle","description":"飞桨致力于让深度学习技术的创新与应用更简单。具有以下特点:同时支持动态图和静态图,兼顾灵活性和效率;精选应用效果最佳算法模型并提供官方支持;真正源于产业实践,提供业界最强的超大规模并行深度学习能力;推理引擎一体化设计,提供训练到多端推理的无缝对接;唯一提供系统化技术服务与支持的深度学习平台"},"pageData":null}})();</script><script>window.docInfo={};</script><script>(function(){window.docInfo.lang="zh";})();</script><script>(function(){window.docInfo.version="1.5";})();</script><!-- Visually hidden (zero-size) copy of the site navigation, followed by the documentation version list. --><div class="paddle-s-e-o"><div class="header"><ul><li><a href="/start">开始使用</a></li><li><a href="/feature">特性</a></li><li>文档<ul><li><a href="/documentation/docs/zh/1.5/api_cn/index_cn.html?from=paddlenav">API</a></li><li><a href="/documentation/docs/zh/1.5/user_guides/index_cn.html?from=paddlenav">使用指南</a></li></ul></li><li>工具平台<ul><li>工具<ul><li><a href="https://github.com/PaddlePaddle/AutoDL/tree/master/AutoDL%20Design">AutoDL</a></li></ul></li></ul></li></ul></div></div><div class="paddle-s-e-o"><div class="docheader"><ul id="doc_version" class="doc-version"><li><a href="/change-version?version=develop">develop</a></li><li><a href="/change-version?version=2.1">2.1</a></li><li><a href="/change-version?version=2.0">2.0</a></li><li><a href="/change-version?version=1.8">1.8</a></li><li><a href="/change-version?version=1.7">1.7</a></li><li><a 
href="/change-version?version=1.6">1.6</a></li><li class="current"><a href="/change-version?version=1.5">1.5</a></li><li><a href="/change-version?version=1.4">1.4</a></li><li><a href="/change-version?version=1.3">1.3</a></li><li><a href="/change-version?version=1.2">1.2</a></li><li><a href="/change-version?version=1.1">1.1</a></li><li><a href="/change-version?version=1.0">1.0</a></li><li><a href="/change-version?version=0.15.0">0.15.0</a></li><li><a href="/change-version?version=0.14.0">0.14.0</a></li><li><a href="/change-version?version=0.13.0">0.13.0</a></li><li><a href="/change-version?version=0.12.0">0.12.0</a></li><li><a href="/change-version?version=0.11.0">0.11.0</a></li><li><a href="/change-version?version=0.10.0">0.10.0</a></li></ul><ul id="doc_lang" class="doc-lang"><li class="current"><a href="/change-lang?lang=zh">中文(简)</a></li><li><a href="/change-lang?lang=en">English(En)</a></li></ul></div><div id="document_content">使用FleetAPI进行分布式训练
==========================

FleetAPI 设计说明
-----------------

Fleet是PaddlePaddle Fluid最新优化的多机API版本, 统一了多机API的实现,兼容Transpiler/Collective两种模式。 可以在MPI环境及K8S环境下进行多机训练,以及自定义分布式训练配置。


FleetAPI 接口说明
------------------------------
.. csv-table::
   :header: "接口", "说明"

   "init", "fleet初始化,需要在使用fleet其他接口前先调用,用于定义多机的环境配置"
   "distributed_optimizer", "fleet多机训练策略优化,接收一个标准Optimizer及相应的多机运行策略,fleet会根据优化策略进行优化"
   "init_server", "fleet加载model_dir中保存的模型相关参数进行parameter server的初始化"
   "run_server", "fleet启动parameter server服务"
   "init_worker", "fleet初始化当前worker运行环境"
   "is_worker", "判断当前节点是否是Worker节点,是则返回True,否则返回False"
   "is_server", "判断当前节点是否是Server节点,是则返回True,否则返回False"
   "save_inference_model", "fleet保存预测相关的模型及参数,参数及用法参考 :code:`fluid.io.save_inference_model`"
   "save_persistables", "fleet保存多机模型参数,参数及用法参考 :code:`fluid.io.save_persistables`"


FleetAPI 一般训练步骤
------------------------------

通过import引入需要使用的模式
++++++++++++++++++++++++++++

使用parameter server方式的训练:

.. code-block:: python

    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet


初始化
++++++++++++++++++
Fleet使用 :code:`fleet.init(role_maker=None)` 进行初始化

当用户不指定role_maker, 则Fleet默认用户使用MPI环境,会采用MPISymetricRoleMaker.

如果用户使用非MPI环境,则需要通过UserDefinedRoleMaker自行定义执行环境。
例如:

.. code-block:: python

    role = UserDefinedRoleMaker(current_id=0,
                     role=Role.WORKER,
                     worker_num=3,
                     server_endpoints=["127.0.0.1:6001","127.0.0.1:6002"])
    fleet.init(role_maker=role)


分布式策略及多机配置
++++++++++++++++++++

对于Transpiler模式,需要使用 DistributeTranspilerConfig 指定多机配置。
Fleet需要在用户定义的optimizer之上装饰 :code:`fleet.distributed_optimizer` 来完成多机分布式策略的配置。

.. csv-table::
   :header: "接口", "说明"

   "sync_mode", "Fleet可以支持同步训练或异步训练, 默认会生成同步训练的分布式程序,通过指定 :code:`sync_mode=False` 参数即可生成异步训练的程序"
   "split_method", "指定参数在parameter server上的分布方式, 默认使用 :code:`RoundRobin`, 也可选 :code:`HashName`"
   "slice_var_up", "指定是否将较大(大于8192个元素)的参数切分到多个parameter server以均衡计算负载,默认为开启"


例如:

.. code-block:: python

    config = DistributeTranspilerConfig()
    config.sync_mode = True
   
    # build network
    # ...
    avg_cost = model()
    
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    # 使用 fleet.distributed_optimizer 加入分布式策略配置及多机优化
    optimizer = fleet.distributed_optimizer(optimizer, config)
    optimizer.minimize(avg_cost)


具体训练流程
++++++++++++++++

.. code-block:: python

    # 启动server
    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
 
    # 启动worker
    if fleet.is_worker():
        # 初始化worker配置
        fleet.init_worker()
    
        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
        train_reader = paddle.batch(fake_reader(), batch_size=24)
    
        exe.run(fleet.startup_program)
    
        PASS_NUM = 10
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                avg_loss_value, auc_value, auc_batch_value = \
                    exe.run(fleet.main_program, feed=feeder.feed(data), fetch_list=[avg_cost, auc, auc_batch])
                print("Pass %d, cost = %f, auc = %f, batch_auc = %f" % (pass_id, avg_loss_value, auc_value, auc_batch_value))
        # 通知server,当前节点训练结束
        fleet.stop_worker()


</div></div><!-- Webpack runtime bootstrap (minified build output). It installs its chunk callback as the push method of the global "webpackJsonppaddle-site-front" array, replays entries already queued there, and runs a deferred entry module only once every chunk that entry lists has state 0 in the installed-chunks map (which starts with chunk 58 set to 0). l.p is set to the CDN base URL. The runtime is invoked with an empty module array; the chunk scripts after it presumably register the actual modules. --><script>!function(e){function t(t){for(var n,l,p=t[0],a=t[1],f=t[2],c=0,s=[];c<p.length;c++)l=p[c],Object.prototype.hasOwnProperty.call(o,l)&&o[l]&&s.push(o[l][0]),o[l]=0;for(n in a)Object.prototype.hasOwnProperty.call(a,n)&&(e[n]=a[n]);for(i&&i(t);s.length;)s.shift()();return u.push.apply(u,f||[]),r()}function r(){for(var e,t=0;t<u.length;t++){for(var r=u[t],n=!0,p=1;p<r.length;p++){var a=r[p];0!==o[a]&&(n=!1)}n&&(u.splice(t--,1),e=l(l.s=r[0]))}return e}var n={},o={58:0},u=[];function l(t){if(n[t])return n[t].exports;var r=n[t]={i:t,l:!1,exports:{}};return e[t].call(r.exports,r,r.exports,l),r.l=!0,r.exports}l.m=e,l.c=n,l.d=function(e,t,r){l.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:r})},l.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},l.t=function(e,t){if(1&t&&(e=l(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var r=Object.create(null);if(l.r(r),Object.defineProperty(r,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var n in e)l.d(r,n,function(t){return e[t]}.bind(null,n));return r},l.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return l.d(t,"a",t),t},l.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},l.p="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/";var p=this["webpackJsonppaddle-site-front"]=this["webpackJsonppaddle-site-front"]||[],a=p.push.bind(p);p.push=t,p=p.slice();for(var f=0;f<p.length;f++)t(p[f]);var i=a;r()}([])</script><script src="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/js/0.381d6006.chunk.js"></script><script src="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/js/1.7178e4d4.chunk.js"></script><script src="https://paddlepaddle-org-cn.cdn.bcebos.com/paddle-site-front/static/js/document.2d1c02c2.chunk.js"></script></body></html>