In this post I am going to show you how to build your own answer-finding system with Python. Basically, this automation takes a picture of a multiple-choice question and finds the answer for you.
Vision AI
Check the documentation to enable and set up the API. After configuration, you have to create a JSON file that contains your key and download it to your computer.
pip install google-cloud-vision
import os, io
from google.cloud import vision
from google.cloud.vision import types
# JSON file that contains your key
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'your_private_key.json'
# Instantiates a client
client = vision.ImageAnnotatorClient()
FILE_NAME = 'your_image_file.jpg'
# Loads the image into memory
with io.open(os.path.join(FILE_NAME), 'rb') as image_file:
    content = image_file.read()
image = vision.types.Image(content=content)
# Performs text detection on the image file
response = client.text_detection(image=image)
print(response)
# Extract description
texts = response.text_annotations[0]
print(texts.description)
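Before relying on the output, it is worth checking that the API call actually succeeded and that some text was detected. A minimal sketch, using the status field carried on the response:
# Fail fast if the Vision API reported an error or found no text
if response.error.message:
    raise RuntimeError(response.error.message)
if not response.text_annotations:
    raise RuntimeError('No text detected in ' + FILE_NAME)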
Search Question on Google
import re
import urllib.parse
# If ending with question mark
if '?' in texts.description:
    question = re.search('([^?]+)', texts.description).group(1)
# If ending with colon
elif ':' in texts.description:
    question = re.search('([^:]+)', texts.description).group(1)
# If ending with newline
elif '\n' in texts.description:
    question = re.search('([^\n]+)', texts.description).group(1)
# Fallback: no delimiter found, use the whole detected text
else:
    question = texts.description.strip()
# Slugify the match
slugify_keyword = urllib.parse.quote_plus(question)
print(slugify_keyword)
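To see what the extraction and slugification produce, here is a purely hypothetical OCR result (your image will of course give different text):
# Hypothetical OCR output, for illustration only
sample_description = 'Which of the following is not a planet?\nA. Pluto\nB. Mars\nC. Venus'
sample_question = re.search('([^?]+)', sample_description).group(1)
print(sample_question)                           # Which of the following is not a planet
print(urllib.parse.quote_plus(sample_question))  # Which+of+the+following+is+not+a+planet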
Crawl the Information
When you search on Google, each result link is wrapped in a redirect that looks like the example below; the actual target URL sits between q= and &sa, and that is the part we extract:
/url?q=https://en.wikipedia.org/wiki/IAU_definition_of_planet&sa=U&ved=2ahUKEwiSmtrEsaTnAhXtwsQBHduCCO4QFjAAegQIBBAB&usg=AOvVaw0HzMKrBxdHZj5u1Yq1t0en
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

result_urls = []

def crawl_result_urls():
    req = Request('https://google.com/search?q=' + slugify_keyword, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    bs = BeautifulSoup(html, 'html.parser')
    results = bs.find_all('div', class_='ZINbbc')
    try:
        for result in results:
            link = result.find('a')['href']
            # Checking if it is a result url (just in case)
            if 'url' in link:
                result_urls.append(re.search('q=(.*)&sa', link).group(1))
    except (AttributeError, IndexError) as e:
        pass
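Applied to the sample link shown above, the regular expression strips the redirect wrapper and keeps only the target URL:
# Quick illustration with the sample href from above
link = '/url?q=https://en.wikipedia.org/wiki/IAU_definition_of_planet&sa=U&ved=2ahUKEwiSmtrEsaTnAhXtwsQBHduCCO4QFjAAegQIBBAB&usg=AOvVaw0HzMKrBxdHZj5u1Yq1t0en'
print(re.search('q=(.*)&sa', link).group(1))
# https://en.wikipedia.org/wiki/IAU_definition_of_planet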
Question Answering System
pip install cdqa
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline
# Download data and models
download_bnpp_data(dir='./data/bnpp_newsroom_v1.1/')
download_model(model='bert-squad_1.1', dir='./models')
# Loading data and filtering / preprocessing the documents
df = pd.read_csv('data/bnpp_newsroom_v1.1/bnpp_newsroom-v1.1.csv', converters={'paragraphs': literal_eval})
df = filter_paragraphs(df)
# Loading QAPipeline with CPU version of BERT Reader pretrained on SQuAD 1.1
cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
# Fitting the retriever to the list of documents in the dataframe
cdqa_pipeline.fit_retriever(df)
# Sending a question to the pipeline and getting prediction
query = 'Since when does the Excellence Program of BNP Paribas exist?'
prediction = cdqa_pipeline.predict(query)
print('query: {}\n'.format(query))
print('answer: {}\n'.format(prediction[0]))
print('title: {}\n'.format(prediction[1]))
print('paragraph: {}\n'.format(prediction[2]))
To extract answers, cdQA relies on a reader, which is basically a pre-trained deep learning model. The model used here is the PyTorch version of the well-known NLP model BERT, fine-tuned on SQuAD 1.1. The retriever first selects the paragraphs most likely to contain the answer, and the reader then returns the most likely answer to the question.
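The retriever side works on a dataframe of documents, one row per document with a title and a list of paragraphs. A toy illustration of that layout (assuming the same 'title'/'paragraphs' columns as the BNP Paribas CSV above):
# Toy knowledge base in the same layout as the BNP Paribas dataframe
# (assumption: 'title' and 'paragraphs' are the columns the retriever consumes)
toy_df = pd.DataFrame({
    'title': ['IAU definition of planet'],
    'paragraphs': [[
        'A planet orbits the Sun and has cleared the neighbourhood around its orbit.',
        'Bodies that fail the last criterion, such as Pluto, are classed as dwarf planets.'
    ]]
})
print(toy_df)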
Here is the schema of the system mechanism: crawl the top search results for the question, save each result as a document, and let cdQA extract the answer from that collection.
cdQA includes a converter that builds its knowledge base from a directory of PDF files, so I am going to save the crawled data as a PDF file for each result. Hopefully we will end up with 3 PDF files in total (it can be 1 or 2 as well). Additionally, we need to name these PDF files, which is why I crawl the heading of each result page.
import errno
import urllib.error

def get_result_details(url):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
        bs = BeautifulSoup(html, 'html.parser')
        try:
            # Crawl any heading in the result to name the pdf file
            title = bs.find(re.compile('^h[1-6]$')).get_text().strip().replace('?', '').lower()
            # Naming the pdf file
            filename = "/home/coderasha/autoans/pdfs/" + title + ".pdf"
            if not os.path.exists(os.path.dirname(filename)):
                try:
                    os.makedirs(os.path.dirname(filename))
                except OSError as exc:  # Guard against race condition
                    if exc.errno != errno.EEXIST:
                        raise
            with open(filename, 'w') as f:
                # Crawl the first 5 paragraphs
                for line in bs.find_all('p')[:5]:
                    f.write(line.text + '\n')
        except AttributeError:
            pass
    except urllib.error.HTTPError:
        pass
from cdqa.utils.converters import pdf_converter

def find_answer():
    # Build a dataframe from the crawled pdf files
    df = pdf_converter(directory_path='/home/coderasha/autoans/pdfs')
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df)
    query = question + '?'
    prediction = cdqa_pipeline.predict(query)
    print('query: {}\n'.format(query))
    print('answer: {}\n'.format(prediction[0]))
    print('title: {}\n'.format(prediction[1]))
    print('paragraph: {}\n'.format(prediction[2]))
    return prediction[0]
import os, io
import errno
import urllib.error
import urllib.parse
import urllib.request
import hashlib
import re
import requests
from time import sleep
from google.cloud import vision
from google.cloud.vision import types
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.utils.download import download_model, download_bnpp_data
from cdqa.pipeline.cdqa_sklearn import QAPipeline
from cdqa.utils.converters import pdf_converter
result_urls = []
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'your_private_key.json'
client = vision.ImageAnnotatorClient()
FILE_NAME = 'your_image_file.jpg'
with io.open(os.path.join(FILE_NAME), 'rb') as image_file:
    content = image_file.read()
image = vision.types.Image(content=content)
response = client.text_detection(image=image)
texts = response.text_annotations[0]
# print(texts.description)
if '?' in texts.description:
    question = re.search('([^?]+)', texts.description).group(1)
elif ':' in texts.description:
    question = re.search('([^:]+)', texts.description).group(1)
elif '\n' in texts.description:
    question = re.search('([^\n]+)', texts.description).group(1)
else:
    # Fallback: no delimiter found, use the whole detected text
    question = texts.description.strip()
slugify_keyword = urllib.parse.quote_plus(question)
# print(slugify_keyword)
def crawl_result_urls():
    req = Request('https://google.com/search?q=' + slugify_keyword, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(req).read()
    bs = BeautifulSoup(html, 'html.parser')
    results = bs.find_all('div', class_='ZINbbc')
    try:
        for result in results:
            link = result.find('a')['href']
            print(link)
            if 'url' in link:
                result_urls.append(re.search('q=(.*)&sa', link).group(1))
    except (AttributeError, IndexError) as e:
        pass
def get_result_details(url):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
        bs = BeautifulSoup(html, 'html.parser')
        try:
            title = bs.find(re.compile('^h[1-6]$')).get_text().strip().replace('?', '').lower()
            # Set your path to the pdf directory
            filename = "/path/to/pdf_folder/" + title + ".pdf"
            if not os.path.exists(os.path.dirname(filename)):
                try:
                    os.makedirs(os.path.dirname(filename))
                except OSError as exc:
                    if exc.errno != errno.EEXIST:
                        raise
            with open(filename, 'w') as f:
                for line in bs.find_all('p')[:5]:
                    f.write(line.text + '\n')
        except AttributeError:
            pass
    except urllib.error.HTTPError:
        pass
def find_answer():
    # Set your path to the pdf directory
    df = pdf_converter(directory_path='/path/to/pdf_folder/')
    cdqa_pipeline = QAPipeline(reader='models/bert_qa.joblib')
    cdqa_pipeline.fit_retriever(df)
    query = question + '?'
    prediction = cdqa_pipeline.predict(query)
    # print('query: {}\n'.format(query))
    # print('answer: {}\n'.format(prediction[0]))
    # print('title: {}\n'.format(prediction[1]))
    # print('paragraph: {}\n'.format(prediction[2]))
    return prediction[0]
crawl_result_urls()
for url in result_urls[:3]:
    get_result_details(url)
    sleep(5)
answer = find_answer()
print('Answer: ' + answer)
Check Reverse Python for more cool content.