Using MapReduce to Bulk-Import All Data from an HBase Table into Elasticsearch

For projects that combine HBase and Elasticsearch, synchronizing data and initializing Elasticsearch from the data already sitting in HBase are routine tasks. In an earlier post I described how to write your own HBase component to keep Elasticsearch in sync; today I want to show how to import all of the data from HBase into Elasticsearch in a single pass. All it takes is one MapReduce job. The complete code is hosted on Gitee (this repository), and comments are welcome on my CSDN blog.

  • Writing the MapReduce job that imports data into Elasticsearch
  • Running the MapReduce job
  • Summary

  • Writing the MapReduce job that imports data into Elasticsearch

Before writing the job, let's think through the import logic: HBase holds a source table A, Elasticsearch holds a target index B, and all of A's data has to be imported into B. So, ahead of the map phase, we scan the full contents of table A, read the scanned rows in the map phase, and bulk them into index B through the Elasticsearch Java API. Since we are not analyzing or aggregating the data, just bulk-indexing whatever we read, no reducer is needed: a single map phase is enough. The key point is that MapReduce can read directly from HBase; all we have to do is make our mapper extend the TableMapper class. Enough talk, here is the core code.
package org.eminem.hadoop.mapper;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.update.UpdateRequestBuilder;
import org.elasticsearch.client.Client;
import org.eminem.elasticsearch.ESClient;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * ES init mapper
 */
public class ESInitMapper extends TableMapper<NullWritable, NullWritable> {

    private Client client;
    private static final Log LOG = LogFactory.getLog(ESInitMapper.class);
    private BulkRequestBuilder bulkRequestBuilder;
    private String index;
    private String columnFamily;

    /**
     * init ES
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        try {
            String clusterName = context.getConfiguration().get("es.cluster.name");
            String host = context.getConfiguration().get("es.cluster.host");
            int port = Integer.parseInt(context.getConfiguration().get("es.cluster.port"));
            client = ESClient.getEsClient(clusterName,host,port);
            bulkRequestBuilder = client.prepareBulk();
            bulkRequestBuilder.setRefresh(true);
            index = context.getConfiguration().get("index");
            columnFamily = context.getConfiguration().get("columnFamily");
        } catch (Exception ex) {
            LOG.error("init ES Client error:" + ex.getMessage());
        }
    }

    @Override
    protected void map(ImmutableBytesWritable key, Result values, Context context) throws IOException, InterruptedException {
        Map<String, Object> json = new HashMap<String, Object>();
        Map<String, Object> infoJson = new HashMap<String, Object>();
        String rowKey = Bytes.toString(values.getRow());
        for (KeyValue kv : values.raw()) {
            String keyHbase = Bytes.toString(kv.getQualifier());
            String valueHbase = Bytes.toString(kv.getValue());
            json.put(keyHbase, valueHbase);
        }
        // nest the fields under the column family name (you can skip this nesting or rename it)
        infoJson.put(columnFamily, json);
        // upsert the document: the index name doubles as the type name, the row key becomes the doc id
        addUpdateBuilderToBulk(client.prepareUpdate(index, index, rowKey).setDocAsUpsert(true).setDoc(infoJson));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        if (0 < bulkRequestBuilder.numberOfActions()) {
            try {
                bulkRequest();
            } catch (Exception ex) {
                LOG.error("Bulk " + index + " index error: " + ex.getMessage());
            }
        }
        // close the ES client
        client.close();
    }

    /**
     * execute bulk process
     * @throws Exception
     */
    private void bulkRequest() throws Exception {
        BulkResponse bulkResponse = bulkRequestBuilder.execute().actionGet();
        if (!bulkResponse.hasFailures()) {
            // reset the buffer only after a fully successful flush; on failure the
            // queued actions stay in the builder and go out with the next flush
            bulkRequestBuilder = client.prepareBulk();
        }
    }

    /**
     * add a prepared update request to the bulk buffer, flushing every 500 actions
     * @param builder
     */
    private synchronized void addUpdateBuilderToBulk(UpdateRequestBuilder builder) {
        try {
            if (bulkRequestBuilder.numberOfActions() != 0 && (bulkRequestBuilder.numberOfActions() % 500 == 0)) {
                bulkRequest();
            }
            bulkRequestBuilder.add(builder);
        } catch (Exception ex) {
            LOG.error("Bulk " + index + " index error: " + ex.getMessage());
        }
    }
}

That covers the mapper's core logic. ESInitMapper extends TableMapper, and because there is no reducer, both of the mapper's output key and value types are NullWritable. Each call to map() receives one row of the HBase table; the row is assembled into a JSON document and added to a buffer, and once the buffer passes the threshold of 500 actions a bulk request is sent. The point of batching is to avoid writing to Elasticsearch so frequently that performance suffers: we bulk instead. The Elasticsearch client itself comes from an ESClient helper in the repository, which is not reproduced above; a sketch follows.
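Assuming the Elasticsearch 1.x TransportClient API (which the setRefresh(true) call above points to), a minimal sketch of that helper might look like the following; treat it as an illustration rather than the repository's exact code.

package org.eminem.elasticsearch;

import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

/**
 * Sketch of the ESClient helper: one TransportClient shared per JVM.
 */
public class ESClient {

    private static Client client;

    public static synchronized Client getEsClient(String clusterName, String host, int port) {
        if (client == null) {
            // point the client at the cluster named in the job configuration
            Settings settings = ImmutableSettings.settingsBuilder()
                    .put("cluster.name", clusterName)
                    .build();
            client = new TransportClient(settings)
                    .addTransportAddress(new InetSocketTransportAddress(host, port));
        }
        return client;
    }
}

There is one more key piece of code: defining the job and launching it. Normally our MapReduce jobs read files from HDFS, but today we are reading an HBase table, so how should the job be written? Take a look at the code below.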

 package org.eminem.hadoop;
 
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
 import org.apache.hadoop.mapreduce.Job;
 import org.eminem.hadoop.mapper.ESInitMapper;
 import org.eminem.hbase.HbaseClient;
 import org.eminem.utils.PropertiesUtil;
 
 import java.io.IOException;
 
 /**
  * Call Es init job
  */
 public class ESInitCall {
 
     public static void main(String args[]) throws Exception {
         // check the argument count before indexing into args,
         // otherwise a missing argument throws ArrayIndexOutOfBoundsException
         if (args.length < 3) {
             System.err.println("usage: <ES index> <HBase table name> <HBase column family>");
             System.exit(2);
         }
         String index = args[0];
         String tableName = args[1];
         String columnFamily = args[2];
         if (StringUtils.isBlank(index) || StringUtils.isBlank(tableName) || StringUtils.isBlank(columnFamily)) {
             System.err.println("please pass the ES index, the HBase table name and the table's column family as three parameters");
             System.exit(2);
         }
         System.out.println("target index:" + index);
         System.out.println("target table:" + tableName);
         System.out.println("target columnFamily:" + columnFamily);
         System.out.println("es.cluster.name:" + PropertiesUtil.getStringByKey("es.cluster.name"));
         System.out.println("es.cluster.port:" + PropertiesUtil.getStringByKey("es.cluster.port"));
         System.out.println("es.cluster.host:" + PropertiesUtil.getStringByKey("es.cluster.host"));
         System.out.println("hbase.zookeeper.quorum:" + PropertiesUtil.getStringByKey("hbase.zookeeper.quorum"));
         System.out.println("hbase.zookeeper.property.clientPort:"+ PropertiesUtil.getStringByKey("hbase.zookeeper.property.clientPort"));
         Configuration conf = HbaseClient.getConfiguration(PropertiesUtil.getStringByKey("hbase.zookeeper.quorum"), PropertiesUtil.getStringByKey("hbase.zookeeper.property.clientPort"));
         conf.set("index", index);
         conf.set("columnFamily", columnFamily);
         conf.set("es.cluster.name", PropertiesUtil.getStringByKey("es.cluster.name"));
         conf.set("es.cluster.port", PropertiesUtil.getStringByKey("es.cluster.port"));
         conf.set("es.cluster.host", PropertiesUtil.getStringByKey("es.cluster.host"));
         Job job = Job.getInstance(conf, "ESInit " + index + " Call Job");
         job.setJarByClass(ESInitCall.class);
         Scan scan = new Scan();
         scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
         scan.setCacheBlocks(false); // don't set to true for MR jobs
         // configure the map phase
         TableMapReduceUtil.initTableMapperJob(
                 tableName,      // input table
                 scan,              // Scan instance to control CF and attribute selection
                 ESInitMapper.class,   // mapper class
                 null,              // mapper output key
                 null,              // mapper output value
                 job);
         // all writing to ES happens in the mapper, so there is no real reduce phase:
         // pass null for the reducer class and disable reduce tasks entirely
         TableMapReduceUtil.initTableReducerJob(
                 tableName,      // output table
                 null,           // reducer class
                 job);
         job.setNumReduceTasks(0);
         boolean b = job.waitForCompletion(true);
         if (!b) {
             throw new IOException("error with job!");
         }
     }
 }


scan.setCaching(500); — this configures the HBase scanner cache. The Scan carries no conditions, so it covers the whole table; what if the full data set is very large? That is why we set the cache to 500: setCaching controls how many rows each RPC fetches from the region server (absent an explicit value it falls back to the configuration file, and a bare Scan defaults to 1). A larger cache improves throughput, but set it too high and each round trip takes a long time and holds a lot of memory.

scan.setCacheBlocks(false); — this controls whether the blocks the scan reads are placed in the block cache (they are by default). Reads generally flow memory → block cache → disk, and a full-table MapReduce scan touches cold, one-off data, so caching its blocks would only evict genuinely hot data. Hence: don't set this to true for MR jobs.

TableMapReduceUtil.initTableMapperJob(...) in the snippet above is what wires the mapper to HBase. The first parameter is the name of the HBase table to read; the second is the Scan instance that sweeps the table and can filter the data according to criteria you define in advance (see the sketch below). The remaining four parameters are straightforward: the mapper class, the mapper's output key class, the mapper's output value class, and the job itself. As noted earlier, this scenario has no reduce phase, so both the mapper output key class and the mapper output value class are simply null.
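As an aside, here is what filtering through the Scan can look like. This is a sketch only: the "info" family and the row-key bounds are made-up names, so adapt them to your own table.

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.util.Bytes;

Scan scan = new Scan();
scan.setCaching(500);                           // rows fetched per RPC, as discussed above
scan.setCacheBlocks(false);                     // keep the MR scan out of the block cache
scan.addFamily(Bytes.toBytes("info"));          // read only one column family (made-up name)
scan.setStartRow(Bytes.toBytes("row-0000001")); // inclusive lower bound on the row key
scan.setStopRow(Bytes.toBytes("row-0999999"));  // exclusive upper bound on the row key
// a server-side filter also works, e.g. keeping only row keys with a given prefix:
// scan.setFilter(new PrefixFilter(Bytes.toBytes("row-0")));

These restrictions are applied on the region-server side, so the mappers only ever receive the rows you asked for.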

TableMapReduceUtil.initTableReducerJob(tableName, null, job) together with job.setNumReduceTasks(0) handles the output side: all the writing to Elasticsearch happens in the mapper, so the reducer class is null and the number of reduce tasks is zero, and the job finishes after the map phase.
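Since nothing is actually written back to HBase here, an alternative (a sketch, not what this repository does) is to skip initTableReducerJob altogether and declare the output format explicitly:

import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

job.setOutputFormatClass(NullOutputFormat.class); // the job emits no HDFS/HBase output
job.setNumReduceTasks(0);                         // map-only job

Either way the effect is the same: the job ends after the map phase, and Elasticsearch receives everything through the bulk requests issued inside the mapper.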

  • Running the MapReduce job

Package the project with Maven:

mvn clean package

Then launch the job with the following shell command, passing the target ES index, the source HBase table name, and that table's column family as the three arguments:

 hadoop jar hbase-elasticsearch-init-1.0-SNAPSHOT-mapreduce.jar  testIndex testTableName  hbaseColumnFamily
 
  • Summary

This is a very simple MapReduce job; the main point was to show how to read data out of HBase and run a MapReduce over it. Depending on the actual scenario, we can use the TableMapReduceUtil utility class to initialize the mapper and reducer, and use the Scan to filter and extract exactly the data we want. The steps for reading HBase data into MapReduce are listed below; a minimal reducer skeleton follows the list.

1. Write a Mapper that extends TableMapper.
2. If a reducer is needed, write a Reducer that extends TableReducer.
3. Use the TableMapReduceUtil utility class to initialize the mapper (and reducer, if any).
4. Before initializing the mapper, set up a Scan to filter and select the data to read from the HBase table.
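Step 2 is not exercised in this project, since writing to Elasticsearch needs no reduce phase. For completeness, here is a minimal sketch of what a TableReducer could look like if you did need one; the class name, column family and qualifier below are made up for illustration and are not part of this repository.

package org.eminem.hadoop.reducer;

import java.io.IOException;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

/**
 * Hypothetical reducer: sums the counts per key and writes the result
 * back to the HBase table passed to initTableReducerJob.
 */
public class ExampleCountReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // the reduce key becomes the row key; "cf" and "count" are made-up names
        Put put = new Put(Bytes.toBytes(key.toString()));
        put.add(Bytes.toBytes("cf"), Bytes.toBytes("count"), Bytes.toBytes(sum));
        context.write(new ImmutableBytesWritable(put.getRow()), put);
    }
}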

