package org.culturegraph.cluster.job.ingest;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.culturegraph.cluster.inputformat.XmlInputFormat;
import org.culturegraph.cluster.sink.ComplexPutWriter;
import org.culturegraph.cluster.util.AbstractJobLauncher;
import org.culturegraph.cluster.util.Column;
import org.culturegraph.cluster.util.ConfigConst;
import org.culturegraph.mediawiki.converter.WikiStreamAnalyzer;
import org.culturegraph.metamorph.core.Metamorph;

/**
 * Map-only Hadoop job that reads a MediaWiki XML dump, splits it into
 * {@code <page>} elements and writes one HBase {@link Put} per page.
 *
 * <p>Required arguments are the output table and the input path; a Metamorph
 * definition may be supplied optionally to transform the analyzer's output
 * before it is turned into a {@code Put}.
 */
public final class WikipediaIngest extends AbstractJobLauncher {
    /** Opening tag that delimits one record for {@link XmlInputFormat}. */
    private static final String START_TAG = "<page>";
    /** Closing tag that delimits one record for {@link XmlInputFormat}. */
    private static final String END_TAG = "</page>";

    /**
     * Mapper that analyzes a single wiki page and stores the resulting
     * {@link Put} directly in the configured HBase table. It emits no
     * key/value pairs of its own.
     *
     * @param <K> unused output key type
     * @param <V> unused output value type
     */
    static final class IngestMapper<K, V> extends Mapper<LongWritable, Text, K, V> {
        /** Counter group used for all counters of this mapper. */
        private static final String WIKIPEDIA = "Wikipedia";
        private static final String ANALYZER_CONFIG = "mediawiki/analyzer.conf";
        private WikiStreamAnalyzer wikiAnalyzer;
        private final ComplexPutWriter putWriter = new ComplexPutWriter();
        private boolean storeRawData;
        private HTable htable;

        IngestMapper() {
        }

        /**
         * Builds the processing pipeline and opens the output table.
         *
         * <p>Without a Metamorph definition the analyzer feeds the put writer
         * directly; otherwise a {@link Metamorph} instance is placed between
         * the two.
         *
         * @param context the task context supplying the job configuration
         * @throws IOException if the HBase table cannot be opened
         * @throws InterruptedException if the superclass setup is interrupted
         */
        @Override // org.apache.hadoop.mapreduce.Mapper
        public void setup(Mapper<LongWritable, Text, K, V>.Context context) throws IOException, InterruptedException {
            super.setup(context);
            final Configuration configuration = context.getConfiguration();
            this.wikiAnalyzer = new WikiStreamAnalyzer(ANALYZER_CONFIG);
            this.storeRawData = configuration.getBoolean(ConfigConst.STORE_RAW_DATA, false);

            final String morphDef = configuration.get(ConfigConst.MORPH_DEF);
            if (morphDef == null) {
                this.wikiAnalyzer.setReceiver(this.putWriter);
            } else {
                // Wire analyzer -> metamorph -> putWriter step by step so the
                // chain does not depend on the return type of setReceiver.
                final Metamorph metamorph = new Metamorph(morphDef);
                this.wikiAnalyzer.setReceiver(metamorph);
                metamorph.setReceiver(this.putWriter);
            }

            this.htable = new HTable(configuration, configuration.get(ConfigConst.OUTPUT_TABLE));
            // Auto flush is turned off here; pending puts are flushed
            // explicitly in cleanup().
            this.htable.setAutoFlush(false);
        }

        /**
         * Flushes pending puts and closes the output table.
         *
         * <p>The table is closed even when the flush (or the superclass
         * cleanup) fails, so the table handle is not leaked.
         *
         * @param context the task context
         * @throws IOException if flushing or closing the table fails
         * @throws InterruptedException if the superclass cleanup is interrupted
         */
        @Override // org.apache.hadoop.mapreduce.Mapper
        public void cleanup(Mapper<LongWritable, Text, K, V>.Context context) throws IOException, InterruptedException {
            try {
                super.cleanup(context);
                this.htable.flushCommits();
            } finally {
                this.htable.close();
            }
        }

        /**
         * Analyzes one wiki page and writes the resulting put to HBase.
         *
         * <p>Counters in the {@code "Wikipedia"} group record how many pages
         * were processed, how many yielded no put at all, and how many yielded
         * a put.
         *
         * @param longWritable the record key supplied by the input format (unused)
         * @param text the XML of a single {@code <page>} element
         * @param context the task context used for counters
         * @throws IOException if writing to the HBase table fails
         */
        @Override // org.apache.hadoop.mapreduce.Mapper
        public void map(LongWritable longWritable, Text text, Mapper<LongWritable, Text, K, V>.Context context) throws IOException {
            context.getCounter(WIKIPEDIA, "articles processed").increment(1L);

            final Reader pageReader = new StringReader(text.toString());
            this.wikiAnalyzer.process(pageReader);

            final Put currentPut = this.putWriter.getCurrentPut();
            if (currentPut == null) {
                context.getCounter(WIKIPEDIA, "empty put").increment(1L);
                return;
            }
            if (this.storeRawData) {
                currentPut.add(Column.Family.RAW, Column.Name.WIKI, validBytes(text));
            }
            if (!currentPut.isEmpty()) {
                this.htable.put(currentPut);
            }
            context.getCounter(WIKIPEDIA, "articles containing text").increment(1L);
        }

        /**
         * Copies exactly the valid portion of a {@link Text}.
         *
         * <p>{@code Text.getBytes()} exposes the backing array, which may be
         * longer than {@code getLength()} and then carries stale bytes from a
         * previous, longer record.
         *
         * @param text the text whose content is copied
         * @return a new array holding the first {@code text.getLength()} bytes
         */
        private static byte[] validBytes(Text text) {
            final byte[] copy = new byte[text.getLength()];
            System.arraycopy(text.getBytes(), 0, copy, 0, copy.length);
            return copy;
        }
    }

    /**
     * Command line entry point: registers the job's arguments and launches it.
     *
     * @param strArr the command line arguments
     */
    public static void main(String[] strArr) {
        final WikipediaIngest wikipediaIngest = new WikipediaIngest();
        wikipediaIngest.addRequiredArguments(ConfigConst.OUTPUT_TABLE, ConfigConst.INPUT_PATH);
        wikipediaIngest.addOptionalArguments(ConfigConst.MORPH_DEF);
        launch(wikipediaIngest, strArr);
    }

    /**
     * Adds the record delimiters for {@link XmlInputFormat}, disables
     * speculative map execution unless it was configured explicitly, and
     * returns the result of {@code HBaseConfiguration.create} for the given
     * configuration.
     *
     * @param configuration the configuration to prepare; it is modified in place
     * @return the configuration returned by {@code HBaseConfiguration.create}
     */
    @Override // org.culturegraph.cluster.util.AbstractJobLauncher
    protected Configuration prepareConf(Configuration configuration) {
        configuration.set(XmlInputFormat.START_TAG_KEY, START_TAG);
        configuration.set(XmlInputFormat.END_TAG_KEY, END_TAG);
        configuration.setIfUnset("mapred.map.tasks.speculative.execution", "false");
        return HBaseConfiguration.create(configuration);
    }

    /**
     * Configures a map-only job: XML input from the configured path, the
     * {@link IngestMapper} as mapper, zero reducers and no job output, since
     * the mapper writes to HBase itself.
     *
     * @param job the job to configure
     * @param configuration the configuration holding the input path
     * @throws IOException if the input path cannot be added
     */
    @Override // org.culturegraph.cluster.util.AbstractJobLauncher
    protected void configureJob(Job job, Configuration configuration) throws IOException {
        FileInputFormat.addInputPath(job, new Path(configuration.get(ConfigConst.INPUT_PATH)));
        job.setInputFormatClass(XmlInputFormat.class);
        job.setMapperClass(IngestMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(NullOutputFormat.class);
    }
}
