# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
# Install the latest main branch of Haystack (with the Colab extras)
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
# Imports needed to run this notebook
from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es, print_questions
# Option 2: In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2
import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # run as the daemon user
)
# wait until ES has started
! sleep 30
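# The fixed 30-second sleep usually suffices, but on a slow instance you can
# poll the Elasticsearch HTTP endpoint until it answers instead. A minimal
# sketch, assuming the `requests` package (preinstalled on Colab):
import time

import requests

for _ in range(30):
    try:
        response = requests.get("http://localhost:9200")
        if response.status_code == 200:
            print("Elasticsearch is up:", response.json()["version"]["number"])
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(1)
else:
    raise RuntimeError("Elasticsearch did not come up in time")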
!wget https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
converter = TextConverter(remove_numeric_tables=True, valid_languages=["nl"])
doc_txt = converter.convert(file_path="/content/data/Chatbot_BVO DDK_22092022.txt", meta=None)[0]
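# convert() returns a list of Haystack Document objects; peek at the
# extracted text to confirm the conversion worked.
print(doc_txt.content[:300])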
# This is a default usage of the PreProcessor.
# Here, it cleans consecutive whitespace
# and splits a single large document into smaller documents.
# Each resulting document is up to 100 words long, and splits never fall in the middle of a sentence.
# Note how the single document passed into the PreProcessor gets split into multiple smaller documents.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    language="nl",
)
docs = preprocessor.process([doc_txt])
print(f"n_docs_input: 1\nn_docs_output: {len(docs)}")
# Initialize Question Generator
question_generator = QuestionGenerator()
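# The generator can also be called on its own, outside a pipeline. A minimal
# sketch; the sample sentence below is made up for illustration. Note that the
# underlying model generates English questions, which is why the Dutch
# documents are translated to English further down.
sample_text = "Python is a programming language created by Guido van Rossum."
print(question_generator.generate(sample_text))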
# Initialize the Elasticsearch document store and fill it with the Dutch documents.
document_store = ElasticsearchDocumentStore()
document_store.write_documents(docs)
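# Sanity check (assumes the default index): the store should now hold one
# document per split produced by the PreProcessor.
print(f"Documents in store: {document_store.get_document_count()}")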
# Load machine translation models
from haystack.nodes import TransformersTranslator
in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-nl-en")
out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-nl")
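# Quick smoke test of the Dutch->English model. Assumption: in Haystack 1.x,
# translate() called with only a plain query returns the translated string.
print(in_translator.translate(query="Wat is een chatbot?"))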
reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
# Wrap the previously defined QuestionAnswerGenerationPipeline
from haystack.pipelines import TranslationWrapperPipeline
pipeline_with_translation = TranslationWrapperPipeline(
    input_translator=in_translator, output_translator=out_translator, pipeline=qag_pipeline
)
for idx, document in enumerate(tqdm(document_store.get_all_documents())):
    print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
    result = pipeline_with_translation.run(documents=[document])
    print_questions(result)
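# To keep the generated pairs for later use (e.g. building an FAQ index),
# collect them while looping instead of only printing them. A sketch, assuming
# the Haystack 1.x result layout: "queries" holds the generated questions and
# "answers" holds one list of Answer objects per query.
import json

qa_pairs = []
for document in document_store.get_all_documents():
    result = pipeline_with_translation.run(documents=[document])
    for query, answers in zip(result["queries"], result["answers"]):
        if answers:  # keep only questions the reader could answer
            qa_pairs.append({"question": query, "answer": answers[0].answer})

with open("generated_qa_pairs.json", "w") as f:
    json.dump(qa_pairs, f, ensure_ascii=False, indent=2)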