then lists all the papers and, from each paper's page, downloads the PDF or abstract.
"""Download the PDFs linked from the NIPS proceedings site.

The crawl is three levels deep: the index page links to "book" pages,
each book page links to "paper" pages, and each paper page links to the
files whose href contains ``pdf``.  Every such file is saved into the
current working directory under the last segment of its URL path.
"""
import posixpath
import sys
import urllib.request
from urllib.parse import urljoin, urlparse

import requests
from lxml import html

BASE_URL = 'https://papers.nips.cc/'
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled connection from hanging the crawl


def _links_containing(page_url, needle):
    """Return the absolute URLs of links on ``page_url`` whose href contains ``needle``.

    Duplicates are dropped while keeping first-seen order.  If the page
    cannot be fetched, a warning is printed to stderr and an empty list
    is returned so the rest of the crawl can continue.
    """
    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f'skipping {page_url}: {err}', file=sys.stderr)
        return []

    tree = html.fromstring(response.content)
    urls = []
    seen = set()
    # '//a/@href' yields only anchors that actually carry an href, so
    # named anchors (<a name="...">) cannot trigger a KeyError.
    for href in tree.xpath('//a/@href'):
        if needle not in href:
            continue
        # urljoin copes with root-relative ('/paper/x'), relative
        # ('paper/x') and absolute hrefs without doubling slashes.
        url = urljoin(BASE_URL, href)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


def _download(url):
    """Save ``url`` into the current directory, named after its last path segment."""
    local = posixpath.basename(urlparse(url).path)
    if not local:
        # The URL ends in '/', so there is no file name to save under.
        return
    try:
        urllib.request.urlretrieve(url, local)
    except OSError as err:  # urllib.error.URLError is a subclass of OSError
        print(f'failed to download {url}: {err}', file=sys.stderr)


def main():
    """Walk index -> books -> papers and download every linked PDF."""
    for book_url in _links_containing(BASE_URL, 'book'):
        for paper_url in _links_containing(book_url, 'paper'):
            for pdf_url in _links_containing(paper_url, 'pdf'):
                _download(pdf_url)


if __name__ == '__main__':
    main()
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.