"""8.2. Scraper para a Biblioteca Digital de Portugal.

Coleta dados e metadados da Biblioteca Nacional Digital Portuguesa (BNDP) utilizando Selenium.
"""

# import libraries
import time
import json
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
# --- Search configuration -------------------------------------------------
from urllib.parse import quote_plus  # script-level import: URL-encode the user keyword

keyword = input("Enter the keyword you want to search for: ")
# Timestamp tagging this run; also used in the output filename.
search_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Headless Chrome configuration.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
driver = webdriver.Chrome(options=chrome_options)

# BNDP search URL, restricted to periodicals ("Publicações Periódicas"),
# up to 1000 results per page. The keyword is URL-encoded so that spaces
# and accented characters do not break the query string.
search_url = f'https://bndigital.bnportugal.gov.pt/records?navigation=default&perpage=1000&page=1&search={quote_plus(keyword)}&fulltext=1&child=1&bookmarks=1&sort=_score&refine%5BDocumentType%5D%5B%5D=Publica%C3%A7%C3%B5es+Peri%C3%B3dicas#page'
driver.get(search_url)

# Total hit count shown on the page, e.g. "22 resultados" -> 22.
results = driver.find_element(By.CLASS_NAME, 'returned_results').text
results = int(''.join(filter(str.isdigit, results)))

# Summary of this search run; written into the output JSON.
search_info = {
    'keyword': keyword,
    'search_time': search_time,
    'results': results,
    'url': search_url,
}
def data_dict(driver, id, dict_):
    """Populate ``dict_`` in place with facet label -> count pairs.

    Parameters
    ----------
    driver : selenium WebDriver or WebElement to search within.
    id : DOM id of the facet container (name shadows the builtin ``id``;
        kept unchanged for caller compatibility).
    dict_ : dict updated in place with ``{label: count_string}``.

    Returns ``None``; if the facet is absent the dict is left untouched.
    """
    try:
        # Wait up to 5 s for the facet container to appear in the DOM.
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.ID, id))
        )
    except Exception:  # typically TimeoutException when the facet is absent
        return
    # Each <li> holds an <a> containing a <div> (label) and a <span> (count).
    items = element.find_element(By.TAG_NAME, 'ul').find_elements(By.TAG_NAME, 'li')
    for item in items:
        anchor = item.find_element(By.TAG_NAME, 'a')
        label = anchor.find_element(By.TAG_NAME, 'div').text
        count = anchor.find_element(By.TAG_NAME, 'span').text
        dict_[label] = count
# metadata helper
def get_metadata(driver):
    """Collect facet metadata from the results page.

    Returns a 4-tuple of dicts: (rights, languages, countries, subjects),
    each mapping facet label -> count string.
    """
    rights_count = {}
    languages_count = {}
    countries_count = {}
    subjects_count = {}
    facets = driver.find_element(By.CSS_SELECTOR, '.show_facets')
    # data_dict fills each dict in place (it returns None, so no assignment).
    data_dict(facets, 'facet_Rights', rights_count)
    data_dict(facets, 'facet_Language', languages_count)
    data_dict(facets, 'facet_locationcountry', countries_count)
    data_dict(facets, 'facet_Subject', subjects_count)
    return rights_count, languages_count, countries_count, subjects_count
# function to get data from each result page
def get_np_data(driver):
    """Extract metadata from a single periodical's detail page.

    Returns a 4-tuple ``(dates_list, id_per, link_per, link_cat)`` where each
    element falls back to ``[]`` / ``None`` when the corresponding element
    is missing from the page.
    """
    info_general = driver.find_element(By.CSS_SELECTOR, 'div.page_content:nth-child(1)')
    try:
        # Issue dates live in the 'data-date' attribute of each <li> in #arbo.
        dates_box = info_general.find_element(By.CSS_SELECTOR, '#arbo')
        raw_dates = [d.get_attribute('data-date')
                     for d in dates_box.find_elements(By.TAG_NAME, 'li')]
        # Drop None and empty-string entries (filtered once, after the loop).
        dates_list = [d for d in raw_dates if d]
    except Exception:
        dates_list = []
    try:
        # Permanent identifier link of the periodical.
        id_per = info_general.find_element(
            By.CSS_SELECTOR, '#small_info > a:nth-child(4)'
        ).get_attribute('href')
    except Exception:
        id_per = None
    try:
        # Shareable permalink stored in the copy button's data-url attribute.
        link_per = info_general.find_element(
            By.CSS_SELECTOR, '.copy_permalink'
        ).get_attribute('data-url')
    except Exception:
        link_per = None
    try:
        # Link to the catalogue record.
        link_cat = info_general.find_element(
            By.CSS_SELECTOR, '.one-third > a:nth-child(1)'
        ).get_attribute('href')
    except Exception:
        link_cat = None
    return dates_list, id_per, link_per, link_cat
def get_data(driver):
    """Scrape every result row currently listed on the results page.

    For each row two dicts are appended to the returned list:
    ``{'title', 'link'}`` followed by
    ``{'dates', 'id_per', 'link_per', 'link_cat'}`` from the detail page.

    NOTE: the original while/for-else construction looped forever when no
    results were present; a single pass over the rows is equivalent for the
    non-empty case and terminates cleanly on an empty list.
    """
    data = []
    navlist = driver.find_elements(By.CLASS_NAME, 'navlist_tr')
    for nav in navlist:
        title = nav.find_element(By.TAG_NAME, 'h3').text
        anchor = nav.find_element(By.TAG_NAME, 'a')
        data.append({'title': title, 'link': anchor.get_attribute('href')})
        # Open the result in a new tab (CTRL+ENTER on the link).
        anchor.send_keys(Keys.CONTROL + Keys.RETURN)
        time.sleep(2)  # crude wait for the new tab to load
        driver.switch_to.window(driver.window_handles[1])
        dates_list, id_per, link_per, link_cat = get_np_data(driver)
        data.append({'dates': dates_list, 'id_per': id_per,
                     'link_per': link_per, 'link_cat': link_cat})
        # Close the detail tab and return to the results tab.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    return data
# --- Run the scrape and write the output JSON ------------------------------
rights_count, languages_count, countries_count, subjects_count = get_metadata(driver)
results_list = get_data(driver)
driver.quit()

# Assemble the sections directly in their final order (replaces the previous
# fragile list/pop/insert index juggling). Keys are the Portuguese section
# headers expected in the output file.
data = {
    'Informações da Busca': search_info,
    'Direitos': rights_count,
    'Idiomas': languages_count,
    'Países': countries_count,
    'Assuntos': subjects_count,
    'Lista de Resultados': results_list,
}
# Write a JSON report tagged with the keyword and timestamp of this run.
with open(f'./data/metadata_general_{keyword}_{search_time}.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)