# 8.2. Scraper for the Biblioteca Digital de Portugal.
# Collects data and metadata from the Portuguese National Digital Library (BNDP) using Selenium.
# import libraries
import json
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# ---- Search setup -------------------------------------------------------
# Ask the user for a search term and timestamp the run (the timestamp is
# embedded in the output file name, hence the filesystem-safe format).
keyword = input("Enter the keyword you want to search for: ")
search_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Chrome driver options: headless with a fixed window size and a desktop
# user-agent so the site serves the full (non-mobile) layout.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
driver = webdriver.Chrome(options=chrome_options)

# Build the search URL. The keyword is URL-encoded (quote_plus) so spaces
# and accented characters do not produce a malformed query string.
search_url = (
    'https://bndigital.bnportugal.gov.pt/records?navigation=default'
    '&perpage=1000&page=1'
    f'&search={quote_plus(keyword)}'
    '&fulltext=1&child=1&bookmarks=1&sort=_score'
    '&refine%5BDocumentType%5D%5B%5D=Publica%C3%A7%C3%B5es+Peri%C3%B3dicas#page'
)
driver.get(search_url)

# Total number of hits reported by the page, e.g. "22 resultados" -> 22.
results = driver.find_element(By.CLASS_NAME, 'returned_results').text
results = int(''.join(filter(str.isdigit, results)))

# Metadata describing this search run, stored alongside the scraped data.
search_info = {
    'keyword': keyword,
    'search_time': search_time,
    'results': results,
    'url': search_url
}
def data_dict(driver, id, dict_):
    """Populate *dict_* with facet-name -> count pairs from one facet panel.

    Parameters
    ----------
    driver : selenium WebDriver or WebElement to search within.
    id : DOM id of the facet container (e.g. ``'facet_Rights'``).
    dict_ : dict mutated in place; left untouched when the facet is
        absent from the page.
    """
    # Wait up to 5 s for the facet container. Not every search renders
    # every facet, so a timeout is an expected, non-fatal outcome.
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, id))
        )
    except Exception:  # selenium TimeoutException: facet simply absent
        return
    # Facet markup: <ul><li><a><div>name</div><span>count</span></a></li>...
    # Reuse the element the wait returned instead of re-querying the DOM.
    items = element.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')
    for item in items:
        link = item.find_element(By.TAG_NAME, 'a')
        name = link.find_element(By.TAG_NAME, 'div').text
        count = link.find_element(By.TAG_NAME, 'span').text
        dict_[name] = count
# função para metadados
def get_metadata(driver):
    """Scrape the four facet panels from the search-results page.

    Returns
    -------
    tuple of four dicts — (rights, languages, countries, subjects) —
    each mapping a facet label to its hit count (as text). A dict stays
    empty when its facet is missing from the page.
    """
    rights_count = {}
    languages_count = {}
    countries_count = {}
    subjects_count = {}
    # Restrict lookups to the facet sidebar so the ids resolve there.
    facets = driver.find_element(By.CSS_SELECTOR, '.show_facets')
    # data_dict mutates the dicts in place; it returns nothing useful,
    # so its result is deliberately not bound.
    data_dict(facets, 'facet_Rights', rights_count)
    data_dict(facets, 'facet_Language', languages_count)
    data_dict(facets, 'facet_locationcountry', countries_count)
    data_dict(facets, 'facet_Subject', subjects_count)
    return rights_count, languages_count, countries_count, subjects_count
# function to get data from each result
def get_np_data(driver):
    """Extract metadata from a single periodical's detail page.

    Returns
    -------
    tuple ``(dates_list, id_per, link_per, link_cat)``; each piece falls
    back to ``[]`` / ``None`` when its element is absent from the page.
    """
    info_general = driver.find_element(By.CSS_SELECTOR, 'div.page_content:nth-child(1)')
    # Available issue dates: each <li> in the #arbo tree carries a
    # 'data-date' attribute. filter(None, ...) drops both None and ''.
    try:
        dates_box = info_general.find_element(By.CSS_SELECTOR, '#arbo')
        raw = dates_box.find_elements(By.TAG_NAME, 'li')
        dates_list = list(filter(None, (d.get_attribute('data-date') for d in raw)))
    except Exception:  # #arbo tree not present on this page
        dates_list = []
    # Permanent identifier: the 4th anchor in the small-info block.
    try:
        id_per = info_general.find_element(By.CSS_SELECTOR, '#small_info > a:nth-child(4)')
        id_per = id_per.get_attribute('href')
    except Exception:
        id_per = None
    # Shareable permalink stored in the copy button's data attribute.
    try:
        link_per = info_general.find_element(By.CSS_SELECTOR, '.copy_permalink')
        link_per = link_per.get_attribute('data-url')
    except Exception:
        link_per = None
    # Link to the library catalogue record.
    try:
        link_cat = info_general.find_element(By.CSS_SELECTOR, '.one-third > a:nth-child(1)')
        link_cat = link_cat.get_attribute('href')
    except Exception:
        link_cat = None
    return dates_list, id_per, link_per, link_cat
def get_data(driver):
    """Walk the result list, opening each entry in a new tab to scrape it.

    Returns
    -------
    list alternating ``{'title', 'link'}`` dicts (one per result row)
    with ``{'dates', 'id_per', 'link_per', 'link_cat'}`` dicts (from the
    corresponding detail page), preserving page order.

    NOTE(review): the original wrapped this in a ``while True`` loop whose
    break conditions guaranteed exactly one pass; the dead loop was
    removed without changing behavior.
    """
    data = []
    navlist = driver.find_elements(By.CLASS_NAME, 'navlist_tr')
    for nav in navlist:
        anchor = nav.find_element(By.TAG_NAME, 'a')
        title = nav.find_element(By.TAG_NAME, 'h3').text
        data.append({'title': title, 'link': anchor.get_attribute('href')})
        # Ctrl+Return opens the result in a new tab so the result list
        # itself never has to be re-loaded between entries.
        anchor.send_keys(Keys.CONTROL + Keys.RETURN)
        time.sleep(2)  # crude wait for the new tab to load
        # Scrape the detail page in the newest tab ([-1] is robust even
        # if more than two handles exist), then close it and return
        # focus to the results tab.
        driver.switch_to.window(driver.window_handles[-1])
        dates_list, id_per, link_per, link_cat = get_np_data(driver)
        data.append({'dates': dates_list, 'id_per': id_per,
                     'link_per': link_per, 'link_cat': link_cat})
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    return data
# ---- Run the scrape and persist the results -----------------------------
metadata_general = get_metadata(driver)  # (rights, languages, countries, subjects)
results_list = get_data(driver)
driver.quit()

# Assemble the output in display order: search info first, then the four
# facet dicts, then the per-result list. This replaces the original
# list/append/pop/insert shuffle with an explicit ordering, and drops the
# always-true length guard (there are always exactly six sections).
headers = ['Informações da Busca', 'Direitos', 'Idiomas', 'Países', 'Assuntos', 'Lista de Resultados']
sections = (search_info, *metadata_general, results_list)
data = dict(zip(headers, sections))

# Write one JSON file per run; create ./data first so open() cannot fail
# with FileNotFoundError on a fresh checkout.
Path('./data').mkdir(parents=True, exist_ok=True)
with open(f'./data/metadata_general_{keyword}_{search_time}.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)