Generate Questions from .txt NL-EN-NL
Mon Nov 21 2022 17:28:49 GMT+0000 (UTC)
Saved by @edubrigham #ffmpeg #audio #wav #convert
# Question/answer generation for a Dutch text file with Haystack.
#
# Flow: install Haystack, start a local Elasticsearch, convert and split the
# input file, index the pieces, then run question/answer generation wrapped in
# NL -> EN -> NL machine translation.
#
# The notebook's `!` shell escapes are written as subprocess calls so that this
# script is valid Python both inside and outside a notebook.
import os
import subprocess
import time
from pprint import pprint
from subprocess import PIPE, STDOUT, Popen


def _run(*args: str) -> bool:
    """Run a command without a shell and report whether it exited with status 0.

    Failures are reported through the return value instead of an exception, so a
    failing setup step does not abort the script (like a notebook ``!`` escape).
    """
    try:
        return subprocess.run(list(args)).returncode == 0
    except OSError as err:
        # Executable missing or not runnable: treat it like a failed command.
        print(f"could not run {args[0]}: {err}")
        return False


# Install the latest release of Haystack in your own environment
# pip install farm-haystack

# Install the latest main of Haystack
# NOTE(review): the imports below use the `haystack.nodes` API; confirm that the
# main branch still provides it, otherwise pin a matching release instead.
_run("pip", "install", "--upgrade", "pip")
_run("pip", "install", "git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]")

# Imports needed to run this notebook (they must come after the install step)
from tqdm import tqdm
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import (
    BM25Retriever,
    DocxToTextConverter,
    FARMReader,
    PDFToTextConverter,
    PreProcessor,
    QuestionGenerator,
    TextConverter,
    TransformersTranslator,
)
from haystack.pipelines import (
    QuestionAnswerGenerationPipeline,
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    TranslationWrapperPipeline,
)
from haystack.utils import launch_es, print_questions

# Option 2: In Colab / No Docker environments: Start Elasticsearch from source
_run("wget", "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz", "-q")
_run("tar", "-xzf", "elasticsearch-7.9.2-linux-x86_64.tar.gz")
_run("chown", "-R", "daemon:daemon", "elasticsearch-7.9.2")

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"],
    stdout=PIPE,
    stderr=STDOUT,
    preexec_fn=lambda: os.setuid(1),  # as daemon
)
# wait until ES has started
time.sleep(30)

# Install the xpdf `pdftotext` binary; the copy only runs when extraction succeeded.
_run("wget", "https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz")
if _run("tar", "-xvf", "xpdf-tools-linux-4.04.tar.gz"):
    _run("sudo", "cp", "xpdf-tools-linux-4.04/bin64/pdftotext", "/usr/local/bin")

# Convert the Dutch input text file into a single Haystack document.
converter = TextConverter(remove_numeric_tables=True, valid_languages=["nl"])
doc_txt = converter.convert(file_path="/content/data/Chatbot_BVO DDK_22092022.txt", meta=None)[0]

# Clean consecutive whitespace / empty lines and split the single large document
# into smaller documents. Each piece is split by words with split_length=100, and
# split_respect_sentence_boundary=True keeps breaks out of the middle of sentences.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    language="nl",
)
docs = preprocessor.process([doc_txt])
print(f"n_docs_input: 1\nn_docs_output: {len(docs)}")

# Initialize Question Generator
question_generator = QuestionGenerator()

# Connect to the Elasticsearch instance started above. The original script used
# `document_store` without ever creating it, which raised a NameError.
document_store = ElasticsearchDocumentStore()

# Fill the document store with the split Dutch documents.
document_store.write_documents(docs)

# Load machine translation models (Dutch -> English for input, English -> Dutch for output)
in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-nl-en")
out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-nl")

reader = FARMReader("deepset/roberta-base-squad2")
qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

# Wrap the previously defined QuestionAnswerGenerationPipeline
pipeline_with_translation = TranslationWrapperPipeline(
    input_translator=in_translator, output_translator=out_translator, pipeline=qag_pipeline
)

# Generate and print question/answer pairs for every stored document.
for idx, document in enumerate(tqdm(document_store)):
    print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
    result = pipeline_with_translation.run(documents=[document])
    print_questions(result)
Comments