forschungstage-2023/crawler/crawler.py
import json

import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from w3lib import url as w3url

def crawl_mapping():
    mapping = {}

    class MySpider(scrapy.Spider):
        name = "Crawly"
        allowed_domains = [
            # "minecraft.fandom.com",
            "videospiele.fandom.com",
        ]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        visits = 0
        lexer = LinkExtractor(
            allow=r"videospiele\.fandom\.com",
            deny=[
                r"\.com/.+/wiki",
                r"Diskussion:",
                r"Kategorie:",
                r"Benutzer:",
                r"Benutzer_Blog",
                r"Spezial:",
            ],
            restrict_css="main.page__main",
        )
        # Note: `rules` only takes effect on CrawlSpider subclasses; this plain
        # scrapy.Spider ignores it and follows links manually in parse() below
        # (see the CrawlSpider sketch after this class).
        rules = (
            Rule(lexer),
        )

        def parse(self, response):
            self.visits += 1
            print(f"{self.visits} Visiting", response.url)
            from_url = response.url
            if from_url not in mapping:
                mapping[from_url] = set()
            for link in self.lexer.extract_links(response):
                # Drop query parameters so every page is recorded under a single URL.
                url = w3url.url_query_cleaner(link.url)
                mapping[from_url].add(url)
                print(f"{from_url} -> {url}")
                yield scrapy.Request(url, self.parse)
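
    # A minimal sketch (not part of the original crawl) of the CrawlSpider
    # variant that would actually honour `rules`: CrawlSpider follows the
    # extracted links itself and hands each response to the callback. The
    # class name and the parse_page callback are illustrative assumptions.
    from scrapy.spiders import CrawlSpider

    class MyCrawlSpider(CrawlSpider):
        name = "CrawlyRules"
        allowed_domains = ["videospiele.fandom.com"]
        start_urls = ["https://videospiele.fandom.com/wiki/Videospiele_Wiki"]
        rules = (
            Rule(MySpider.lexer, callback="parse_page", follow=True),
        )

        def parse_page(self, response):
            yield {"url": response.url}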

    class BooksSpider(scrapy.Spider):
        name = "Books"
        allowed_domains = ["books.toscrape.com"]
        start_urls = ["http://books.toscrape.com/"]

        def parse(self, response):
            # Only emit titles from catalogue pages; the start page is skipped.
            if response.url.startswith("http://books.toscrape.com/catalogue"):
                for book in response.css("article.product_pod h3"):
                    title = book.css("a::text").get()
                    yield {"title": title}
            # Follow the pagination link unconditionally so the crawl proceeds
            # past the start page.
            yield from response.follow_all(response.css("li.next a::attr(href)"), self.parse)

    class QuotesSpider(scrapy.Spider):
        name = "quotes"

        def start_requests(self):
            # An optional tag can be passed as a spider argument (-a tag=...).
            url = "https://quotes.toscrape.com/"
            tag = getattr(self, "tag", None)
            if tag is not None:
                url = url + "tag/" + tag
            yield scrapy.Request(url, self.parse)

        def parse(self, response):
            for quote in response.css("div.quote"):
                yield {
                    "text": quote.css("span.text::text").get(),
                    "author": quote.css("small.author::text").get(),
                }
            next_pages = response.css("li.next a::attr(href)")
            yield from response.follow_all(next_pages, self.parse)
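
    # Note: only MySpider is scheduled below. BooksSpider and QuotesSpider are
    # defined but never started here; additional process.crawl(...) calls
    # before process.start() would run them in the same process.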

    process = CrawlerProcess(
        settings={
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )
    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return mapping


mapping = crawl_mapping()
# JSON cannot encode sets, so convert each value to a list before dumping.
mapping_list = {k: list(v) for k, v in mapping.items()}
with open("videospiele.json", "w") as f:
    json.dump(mapping_list, f)
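
# A quick round-trip check (a sketch, not part of the original script): reload
# the dump and rebuild the set-valued mapping to confirm the file is readable.
with open("videospiele.json") as f:
    reloaded = {k: set(v) for k, v in json.load(f).items()}
print(f"reloaded mapping for {len(reloaded)} pages")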