{"id":6656,"date":"2018-08-01T10:24:56","date_gmt":"2018-08-01T02:24:56","guid":{"rendered":"https:\/\/kyle.ai\/blog\/?p=6656"},"modified":"2019-06-12T10:33:47","modified_gmt":"2019-06-12T02:33:47","slug":"spark%e5%bc%80%e5%8f%91%e9%83%a8%e7%bd%b2%e9%81%87%e5%88%b0%e7%9a%84%e9%97%ae%e9%a2%98%e6%b1%87%e6%80%bb","status":"publish","type":"post","link":"https:\/\/kyle.ai\/blog\/6656.html","title":{"rendered":"Spark\u5f00\u53d1\u90e8\u7f72\u9047\u5230\u7684\u95ee\u9898\u6c47\u603b"},"content":{"rendered":"<h1>\u5982\u4f55\u8ba9pyspark\u4f7f\u7528ipython<\/h1>\n<p>vim .\/bin\/pyspark \uff0c\u5728\u6700\u4e0a\u9762\u52a0\u4e00\u884c<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\nexport PYSPARK_DRIVER_PYTHON=ipython\r\n<\/pre>\n<h1>\u5982\u4f55\u5728pyspark\u4e2d\u8fde\u63a5cassandra<\/h1>\n<p>\u5728\u542f\u52a8pyspark\u65f6\uff0c\u6dfb\u52a0 &#8211;packages \u53c2\u6570<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\n.\/bin\/pyspark  --packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.0\r\n<\/pre>\n<p>\u7136\u540e\u5199python\u4ee3\u7801\u4ececassandra\u4e2d\u8bfb\u53d6\u6570\u636e\uff1a<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\nspark.conf.set(&quot;spark.cassandra.connection.host&quot;, &quot;172.31.0.21&quot;)\r\ndata_frame = spark.read.format(&quot;org.apache.spark.sql.cassandra&quot;)\\\r\n            .options(table=&quot;service_name_index&quot;, keyspace=&quot;jaeger&quot;).load()\r\ndata_frame.filter(&quot;start_time&gt;1529459493693772&quot;).show()\r\ndata_frame.rdd.map(lambda r: (r.service_name, 1))\\\r\n                 .reduceByKey(lambda x, y: x+y).collect()\r\n<\/pre>\n<h1>\u5982\u4f55\u6253\u5f00pyspark\u5373\u542f\u7528jupyter notebook<\/h1>\n<p>vim ~\/.bash_profile\uff0c\u6dfb\u52a0\u73af\u5883\u53d8\u91cf<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\nexport SPARK_HOME=&quot;\/home\/kyle\/spark-2.3.0-bin-hadoop2.7&quot;\r\nexport 
PYSPARK_DRIVER_PYTHON=jupyter \r\nexport PYSPARK_DRIVER_PYTHON_OPTS=&quot;notebook&quot;\r\n<\/pre>\n<p>\u6216\u8005\uff0c\u5728\u542f\u52a8pyspark\u7684\u547d\u4ee4\u884c\u4e2d\u8bbe\u7f6e\u73af\u5883\u53d8\u91cf<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\nPYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=&quot;notebook&quot; .\/bin\/pyspark \\\r\n--packages com.datastax.spark:spark-cassandra-connector_2.11:2.3.0\r\n<\/pre>\n<h1>\u8282\u70b9\u8d44\u6e90\u4e0d\u8db3\u62a5\u9519<\/h1>\n<p>\u5982\u679c\u51fa\u73b0\u8fd9\u6837\u7684\u62a5\u8b66\u65e5\u5fd7\uff0c\u5219\u6709\u53ef\u80fd\u662fworker\u8282\u70b9\u7684cpu\u3001\u5185\u5b58\u7b49\u8d44\u6e90\u4e0d\u591f<\/p>\n<pre class=\"brush: plain; title: ; notranslate\" title=\"\">\r\nInitial job has not accepted any resources; \r\ncheck your cluster UI to ensure that workers are registered \r\nand have sufficient resources\r\n<\/pre>\n<p>\u89e3\u51b3\u529e\u6cd5\u662f\uff0c\u53ef\u4ee5\u7ed9worker\u66f4\u591a\u7684\u5185\u5b58\uff0c\u5982\u679c\u5b9e\u5728\u6ca1\u6709\u5185\u5b58\u53ef\u7528\uff0c\u90a3\u4e48\u53ef\u4ee5\u8ba9spark\u9650\u5236\u4e00\u4e0b\u5185\u5b58\u7684\u4f7f\u7528\uff0c\u5728spark-submit\u7684\u65f6\u5019\u6dfb\u52a0\u53c2\u6570\uff1a<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\n~\/spark-2.3.0-bin-hadoop2.7\/bin\/spark-submit --master spark:\/\/172.31.0.53:7077 \\\r\n--executor-memory 512M  \\\r\n--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0 \\\r\nspark_job_kafka_streaming.py\r\n<\/pre>\n<h1>driver\u4e0eworker\u8282\u70b9\u4e0d\u80fd\u901a\u8baf<\/h1>\n<p>\u5982\u679c\u662f\u4ee5client\uff0c\u800c\u4e0d\u662fcluster\u6a21\u5f0f\u63d0\u4ea4\u6211\u4eec\u7684\u7a0b\u5e8f\uff08spark-submit \u7684--deploy-mode 
\u53c2\u6570\uff09\uff0c\u90a3\u4e48\u6211\u4eec\u7a0b\u5e8f\u6240\u5728\u7684driver\u8282\u70b9\uff0c\u548cspark\u7684worker\u8282\u70b9\u4e4b\u95f4\u9700\u8981\u7f51\u7edc\u901a\u8baf\uff0c\u5982\u679c\u76f8\u5e94\u7684ip\u8bbe\u7f6e\u4e0d\u6b63\u786e\uff0c\u4f1a\u5bfc\u81f4\u4e0d\u80fd\u6b63\u5e38\u6267\u884c\u4efb\u52a1\u3002<\/p>\n<ul>\n<li>\u542f\u52a8master\u8282\u70b9\u65f6\uff0c\u8981\u8bbe\u7f6eSPARK_LOCAL_IP\u73af\u5883\u53d8\u91cf\uff0c\u4ee5\u53ca\u7528-h\u53c2\u6570\u6307\u5b9a\u7ed1\u5b9a\u7684ip\u3002<\/li>\n<\/ul>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\n  SPARK_LOCAL_IP=172.31.0.53 .\/sbin\/start-master.sh -h 172.31.0.53\r\n<\/pre>\n<ul>\n<li>\u542f\u52a8slave\u8282\u70b9\u65f6\uff0c\u8981\u8bbe\u7f6eSPARK_LOCAL_IP\u73af\u5883\u53d8\u91cf\uff0c\u5e76\u6307\u5b9amaster\u65f6\u4f7f\u7528\u6b63\u786eip\u3002<\/li>\n<\/ul>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\n  SPARK_LOCAL_IP=172.31.0.21 .\/sbin\/start-slave.sh spark:\/\/172.31.0.53:7077\r\n<\/pre>\n<ul>\n<li>\u5728spark-submit\u7684\u65f6\u5019\uff0c\u8981\u6307\u5b9aSPARK_LOCAL_HOSTNAME\u73af\u5883\u53d8\u91cf\uff0c\u8bbe\u7f6e\u4e3a\u6b63\u786e\u7684ip\u3002<\/li>\n<li>\u5728worker\u4e0edriver\u8282\u70b9\uff0c\u68c0\u67e5\u4e00\u4e0bmaster\u3001slave\u7b49\u8282\u70b9\u7684\u7f51\u7edc\u7aef\u53e3\uff0c\u770b\u80fd\u4e0d\u80fd\u6b63\u5e38\u8fde\u63a5\u3002<\/li>\n<\/ul>\n<h1>\u4f7f\u7528python\u865a\u62df\u73af\u5883<\/h1>\n<p>\u5728driver\u8282\u70b9\uff0c\u901a\u8fc7spark-submit\u63d0\u4ea4py\u811a\u672c\u65f6\uff0c\u5982\u679c\u6211\u4eec\u5f00\u53d1\u65f6\u662f\u7528\u4e86virtualenv\u521b\u5efa\u4e86\u865a\u62df\u73af\u5883\uff0c\u90a3\u4e48\u63d0\u4ea4spark job\u65f6\uff0c\u4e5f\u9700\u8981\u4f7f\u7528\u540c\u4e00\u4e2aenv\u73af\u5883\uff0c\u53ef\u4ee5\u901a\u8fc7\u73af\u5883\u53d8\u91cf\u6765\u8bbe\u7f6e<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\nPYSPARK_DRIVER_PYTHON=\/home\/ops\/spark\/env\/bin\/python 
\r\n<\/pre>\n<h1>\u6253\u5305python\u4f9d\u8d56<\/h1>\n<p>\u5982\u679c\u6211\u4eec\u7684python\u7a0b\u5e8f\u4f9d\u8d56\u4e86\u5f88\u591a\u7b2c\u4e09\u65b9\u6a21\u5757\uff0c\u5728\u865a\u62df\u73af\u5883\u4e2d\u901a\u8fc7pip\u88c5\u4e86\u5f88\u591a\u6a21\u5757\uff0c\u5219\u5728\u63d0\u4ea4spark\u4efb\u52a1\u7684\u65f6\u5019\uff0c\u9700\u8981\u5c06\u8fd9\u4e9b\u4f9d\u8d56\u5305\u63d0\u4ea4\u7ed9spark worker\u3002<\/p>\n<p>\u9996\u5148\u628a\u4f9d\u8d56\u6253\u5305\u5230\u4e00\u4e2azip\u6587\u4ef6\u4e2d\uff1a<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\npip install -t dependencies -r requirements.txt\r\ncd dependencies\r\nzip -r ..\/dependencies.zip .\r\n<\/pre>\n<p>\u7136\u540e\u5728spark-submit\u65f6\u6dfb\u52a0\u53c2\u6570<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\nspark-submit --py-files dependencies.zip spark_job.py\r\n<\/pre>\n<h1>\u90e8\u7f72spark\u7684\u673a\u5668\u4e0d\u80fd\u8fde\u63a5\u5916\u7f51<\/h1>\n<p>\u8fd0\u884cspark-submit\u65f6\uff0cspark\u4f1a\u4ece\u7f51\u4e0a\u4e0b\u8f7d\u4e00\u4e9b\u4f9d\u8d56jar\u5305\uff0c\u6bd4\u5982\u4f60\u6307\u5b9a\u4e86cassandra\u6216kafka\u7684\u6a21\u5757\u65f6\uff0c\u5982\u679c\u6211\u4eec\u7684server\u6ca1\u6709\u8bbf\u95ee\u516c\u7f51\u7684\u6743\u9650\uff0c\u8fd9\u65f6\u5019\u547d\u4ee4\u5c31\u88ab\u5361\u5728\u4e00\u4e2a\u5730\u65b9\u5f88\u4e45\u3002<\/p>\n<p>\u540e\u9762\u6211\u89c2\u5bdf\u65e5\u5fd7\u53d1\u73b0\u4e86\u4e00\u4e2ajar\u5305\u6587\u4ef6\u5939\uff0c~\/.ivy2 
\uff0c\u4e0b\u9762\u6709jars\u548ccache\u4e24\u4e2a\u6587\u4ef6\u5939\uff0c\u4e8e\u662f\u6211\u628a\u81ea\u5df1\u5f00\u53d1\u673a\u4e2d\u8fd9\u4e2a\u6587\u4ef6\u5939\u7684\u6240\u6709\u6587\u4ef6\uff0ccopy\u5230\u90e8\u7f72\u7684\u673a\u5668\u4e0a\u53bb\uff0c\u8fd9\u65f6\u5019spark\u4f1a\u4ececache\u6587\u4ef6\u5939\u4e2d\u67e5\u627e\uff0c\u627e\u5230\u4e86\u76f8\u5e94\u7684\u6587\u4ef6\uff0c\u5c31\u4e0d\u4f1a\u518d\u4ece\u4e92\u8054\u7f51\u4e0b\u8f7djar\u5305\u4e86\uff0c\u89e3\u51b3\u4e86\u8fd9\u4e2a\u95ee\u9898\u3002<\/p>\n<h1>\u8fd0\u884c spark \u811a\u672c\u65f6 google.protobuf \u5305\u627e\u4e0d\u5230<\/h1>\n<p>\u8fd0\u884cspark\u4efb\u52a1\u65f6\uff0c\u5728worker\u8282\u70b9\u4f1a\u62a5\u9519\uff1a<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">\r\nImportError: No module named google.protobuf\r\n<\/pre>\n<p>\u867d\u7136\u6211\u4eec\u901a\u8fc7\u6307\u5b9a --py-files dependencies.zip \u6253\u5305\u4e86\u6240\u9700\u7684\u4f9d\u8d56\u5305\uff0c\u4f46\u59cb\u7ec8\u8fd8\u662f\u627e\u4e0d\u5230\u5305 google.protobuf\u3002<\/p>\n<p>\u540e\u6765\u67e5\u4e86\u4e0b\uff0c\u539f\u6765 google.protobuf \u8fd9\u4e2a\u5305\u6bd4\u8f83\u7279\u6b8a\uff0c\u5b83\u7684\u8def\u5f84\u662f\u901a\u8fc7 site-packages\/protobuf-*.pth \u6587\u4ef6\u914d\u7f6e\u7684\uff0csite-packages\/google \u6587\u4ef6\u5939\u5e76\u4e0d\u662f\u4e2a\u666e\u901a\u7684python\u5305\uff0c\u91cc\u9762\u6ca1\u6709 __init__.py\u3002<\/p>\n<p>\u6240\u4ee5\u53ea\u6709python\u8fdb\u7a0b\u52a0\u8f7d protobuf-*.pth \u914d\u7f6e\u6587\u4ef6\u540e\uff0c\u624d\u80fd\u627e\u5230\u5305 google.protobuf\uff0c\u800c\u52a0\u8f7d\u8fd9\u4e2a\u6587\u4ef6\u662f\u5728python\u8fdb\u7a0b\u542f\u52a8\u65f6\u3002<\/p>\n<p>\u603b\u4e4b\uff0cdependencies.zip \u8fd9\u79cd\u65b9\u5f0f\uff0c\u65e0\u6cd5\u89e3\u51b3 pth \u6587\u4ef6\u7684\u5305\u8def\u5f84\u95ee\u9898\uff0c\u6700\u7ec8\u8fd8\u662f\u5728 worker \u8282\u70b9\u673a\u5668\u4e0a\uff0c\u901a\u8fc7 pip \u5168\u5c40\u5b89\u88c5\u4e86 protobuf 
\u5305\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u5982\u4f55\u8ba9pyspark\u4f7f\u7528ipython vim .\/bin\/pyspark \uff0c\u5728\u6700\u4e0a\u9762\u52a0\u4e00\u884c export P [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[],"class_list":["post-6656","post","type-post","status-publish","format-standard","hentry","category-diary"],"_links":{"self":[{"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/posts\/6656","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/comments?post=6656"}],"version-history":[{"count":5,"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/posts\/6656\/revisions"}],"predecessor-version":[{"id":6814,"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/posts\/6656\/revisions\/6814"}],"wp:attachment":[{"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/media?parent=6656"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/categories?post=6656"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/kyle.ai\/blog\/wp-json\/wp\/v2\/tags?post=6656"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}