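"""Crawl the Videospiele Fandom wiki and record which pages link to which.

crawl_mapping() runs a Scrapy spider over videospiele.fandom.com and builds a
URL -> linked-URLs mapping, which the script then writes to videospiele.json.
Two further example spiders (books.toscrape.com and quotes.toscrape.com) are
defined alongside it but are not scheduled.
"""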
import json

import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from w3lib import url as w3url


def crawl_mapping():
    # Maps each visited URL to the set of URLs linked from that page.
    mapping = {}

    class MySpider(scrapy.Spider):
        name = "Crawly"

        allowed_domains = [
            # "minecraft.fandom.com",
            "videospiele.fandom.com",
        ]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        visits = 0

        # Follow only links within this wiki; skip discussion (Diskussion:),
        # category (Kategorie:), user (Benutzer:), user-blog and special
        # (Spezial:) pages, and only extract links from the main page content.
        lexer = LinkExtractor(
            allow=r"videospiele\.fandom\.com",
            deny=[
                r"\.com/.+/wiki",
                r"Diskussion:",
                r"Kategorie:",
                r"Benutzer:",
                r"Benutzer_Blog",
                r"Spezial:",
            ],
            restrict_css="main.page__main",
        )

        # Note: `rules` is only honoured by CrawlSpider subclasses; this plain
        # scrapy.Spider follows links manually in parse() below.
        rules = (
            Rule(lexer),
        )

        def parse(self, response):
            self.visits += 1
            print(f"{self.visits} Visiting", response.url)

            # `mapping` is the dict local to the enclosing crawl_mapping()
            # call; record every extracted link as an edge from this page.
            from_url = response.url
            if from_url not in mapping:
                mapping[from_url] = set()
            for link in self.lexer.extract_links(response):
                # Drop query strings so equivalent pages collapse to one URL.
                url = w3url.url_query_cleaner(link.url)
                mapping[from_url].add(url)
                print(f"{from_url} -> {url}")
                yield scrapy.Request(url, self.parse)

    class BooksSpider(scrapy.Spider):
        name = "Books"

        allowed_domains = ["books.toscrape.com"]
        start_urls = ["http://books.toscrape.com/"]

        def parse(self, response):
            # Only emit titles for pages under /catalogue/; other pages are
            # just followed for their pagination links.
            if response.url.startswith("http://books.toscrape.com/catalogue"):
                for book in response.css("article.product_pod h3"):
                    title = book.css("a::text").get()
                    yield {"title": title}

            # Follow the "next" pagination link, if present.
            yield from response.follow_all(response.css("li.next a::attr(href)"), self.parse)

    class QuotesSpider(scrapy.Spider):
        name = "quotes"

        def start_requests(self):
            url = "https://quotes.toscrape.com/"
            # An optional `tag` spider argument restricts the crawl to a
            # single tag page.
            tag = getattr(self, "tag", None)
            if tag is not None:
                url = url + "tag/" + tag
            yield scrapy.Request(url, self.parse)

        def parse(self, response):
            for quote in response.css("div.quote"):
                yield {
                    "text": quote.css("span.text::text").get(),
                    "author": quote.css("small.author::text").get(),
                }

            # Follow the "next" pagination link, if present.
            next_pages = response.css("li.next a::attr(href)")
            yield from response.follow_all(next_pages, self.parse)

    process = CrawlerProcess(
        settings={
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )

    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return mapping


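# Note: CrawlerProcess.start() runs the Twisted reactor, which cannot be
# restarted, so crawl_mapping() can only be called once per Python process.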
mapping = crawl_mapping()

# JSON cannot serialize sets, so convert each set of linked URLs to a list.
mapping_list = {k: list(v) for (k, v) in mapping.items()}
rep = json.dumps(mapping_list)
with open("videospiele.json", "w") as f:
    f.write(rep)
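
# BooksSpider and QuotesSpider above are defined but never scheduled. A
# minimal sketch of running one of them instead (this assumes the class is
# moved out of crawl_mapping() to module level, since it is currently local
# to that function):
#
#     process = CrawlerProcess(settings={"FEEDS": {"items.json": {"format": "json"}}})
#     process.crawl(BooksSpider)
#     process.start()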