then lists all papers and, from each paper page, downloads the PDFs or abstract.
import urllib.request
from urllib.parse import urljoin

import requests
from lxml import html
BASE_URL = 'https://papers.nips.cc/'
# Seconds to wait on each page request, so a stalled server cannot hang the crawl.
REQUEST_TIMEOUT = 30


def resolve_links(page_url, hrefs, needle):
    """Return absolute URLs for every href that contains *needle*.

    Args:
        page_url: URL of the page the hrefs were found on; relative hrefs
            are resolved against it.
        hrefs: Iterable of raw ``href`` values. ``None`` and empty strings
            are ignored.
        needle: Substring the raw href must contain to be selected.

    Returns:
        A list of absolute URLs in first-seen order with duplicates removed,
        so a page that links the same target several times is only followed
        once.
    """
    selected = (href for href in hrefs if href and needle in href)
    # urljoin copes with root-relative ('/paper/x'), relative and absolute
    # hrefs; plain string concatenation produced '//' or doubled hosts.
    # dict.fromkeys de-duplicates while preserving order.
    return list(dict.fromkeys(urljoin(page_url, href) for href in selected))


def fetch_links(url, needle):
    """Fetch *url* and return the absolute link targets containing *needle*.

    Pages that answer with an error status or an empty body yield an empty
    list, so one bad page does not abort the whole crawl.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if not response.ok or not response.content:
        return []
    tree = html.fromstring(response.content)
    # '//a/@href' yields only the anchors that actually carry an href, which
    # avoids a KeyError on anchors such as <a name="...">.
    return resolve_links(response.url, tree.xpath('//a/@href'), needle)


def local_filename(url):
    """Return the last path segment of *url*, used as the local file name."""
    return url.split('/')[-1]


def main():
    """Walk books -> papers -> pdf links and save each file to the current directory."""
    for book_url in fetch_links(BASE_URL, 'book'):
        for paper_url in fetch_links(book_url, 'paper'):
            for file_url in fetch_links(paper_url, 'pdf'):
                local = local_filename(file_url)
                if not local:
                    # Link ends in '/', so there is no file name to save under.
                    continue
                urllib.request.urlretrieve(file_url, local)


if __name__ == '__main__':
    main()
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.