package org.culturegraph.cluster.job.text;

import java.io.IOException;
import java.io.StringReader;
import joptsimple.internal.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.culturegraph.cluster.util.AbstractJobLauncher;
import org.culturegraph.cluster.util.ConfigConst;

/* loaded from: input_file:lodmill-rd-0.1.0-SNAPSHOT-jar-with-dependencies.jar:org/culturegraph/cluster/job/text/TokenCounter.class */
/**
 * Job launcher that counts how often each analyzer token occurs in a tab-separated text input.
 *
 * <p>The map phase ({@link AnalyserMapper}) emits {@code (token, 1)} pairs; {@link IntSumReducer}
 * is used both as combiner and as reducer to add the ones up.
 */
public final class TokenCounter extends AbstractJobLauncher {

    /**
     * Mapper that tokenizes the part of each record following the second tab separator with
     * {@link MyGermanAnalyzer} and writes every token with a count of one.
     *
     * <p>Besides the output pairs it maintains two kinds of counters in the group named after this
     * class: {@code "tokens counted"} (number of emitted tokens) and one counter per distinct value
     * of the record slice between the first and the second separator (number of records seen).
     */
    static final class AnalyserMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final String SEPARATOR = "\t";
        private static final String NAME = AnalyserMapper.class.getSimpleName();
        private static final String TOKENS_COUNTED = "tokens counted";
        private static final IntWritable ONE = new IntWritable(1);
        private final Text tokenText = new Text();
        private final Analyzer analyzer = new MyGermanAnalyzer();

        /**
         * Tokenizes one input record and emits {@code (token, 1)} for every token found.
         *
         * @param longWritable key of the input record; only used in the error message
         * @param text         the record; must contain at least two tab separators
         * @param context      receives the output pairs and the counter updates
         * @throws IOException          if the record contains fewer than two separators, or if
         *                              tokenizing or writing fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override // org.apache.hadoop.mapreduce.Mapper
        public void map(LongWritable longWritable, Text text, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            final int firstSeparator = text.find(SEPARATOR);
            final int secondSeparator = firstSeparator < 0 ? -1 : text.find(SEPARATOR, firstSeparator + 1);
            if (secondSeparator < 0) {
                // Without this guard Text.decode is called with a negative offset or length and
                // fails with an unhelpful IndexOutOfBoundsException.
                throw new IOException("Expected at least two tab separators in record with key " + longWritable);
            }

            // NOTE(review): the slice starts AT the first separator, so the resulting counter name
            // begins with a tab character. Kept unchanged to preserve existing counter names.
            final String recordCounterName = Text.decode(text.getBytes(), firstSeparator, secondSeparator - firstSeparator);
            // The slice handed to the analyzer likewise starts at the second separator itself.
            final String content = Text.decode(text.getBytes(), secondSeparator, text.getLength() - secondSeparator);

            long tokens = 0L;
            final TokenStream tokenStream = this.analyzer.tokenStream(null, new StringReader(content));
            try {
                final CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
                // Lucene's consumer workflow: reset(), incrementToken() until false, end(), close().
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    this.tokenText.set(term.toString());
                    context.write(this.tokenText, ONE);
                    tokens++;
                }
                tokenStream.end();
            } finally {
                tokenStream.close();
            }

            // Increment once per record instead of once per token; skip the call for records
            // without tokens so that the counter is not created with a value of zero.
            if (tokens > 0L) {
                context.getCounter(NAME, TOKENS_COUNTED).increment(tokens);
            }
            context.getCounter(NAME, recordCounterName).increment(1L);
        }
    }

    /**
     * Command line entry point; hands a new {@code TokenCounter} and the arguments to
     * {@code launch}.
     *
     * @param strArr the command line arguments
     */
    public static void main(String[] strArr) {
        launch(new TokenCounter(), strArr);
    }

    /**
     * Sets the job name from the configured input path and passes the output and input path keys
     * to {@code addRequiredArguments}.
     *
     * @param configuration not used; the configuration is obtained through {@code getConf()}
     * @return the configuration returned by {@code getConf()}
     */
    @Override // org.culturegraph.cluster.util.AbstractJobLauncher
    protected Configuration prepareConf(Configuration configuration) {
        setJobName("Count tokens in '" + getConf().get(ConfigConst.INPUT_PATH) + "'");
        addRequiredArguments(ConfigConst.OUTPUT_PATH, ConfigConst.INPUT_PATH);
        return getConf();
    }

    /**
     * Wires {@link AnalyserMapper} as the file input mapper with {@link Text} keys and
     * {@link IntWritable} values, and {@link IntSumReducer} as combiner and text output reducer.
     *
     * @param job           the job to configure
     * @param configuration the configuration passed on to the configuration helpers
     * @throws IOException if one of the configuration helpers fails
     */
    @Override // org.culturegraph.cluster.util.AbstractJobLauncher
    protected void configureJob(Job job, Configuration configuration) throws IOException {
        configureFileInputMapper(job, configuration, AnalyserMapper.class, Text.class, IntWritable.class);
        job.setCombinerClass(IntSumReducer.class);
        configureTextOutputReducer(job, configuration, IntSumReducer.class);
    }
}
