new file structure and some changes to the main file
This commit is contained in:
82
bin/webscratching/price_extractor_objectoriented.py
Normal file
82
bin/webscratching/price_extractor_objectoriented.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""@package docstring
|
||||
This package extracts prices from websites. Currently, the package only allows for extraction of prices from
|
||||
https://digitec.ch/"""
|
||||
|
||||
import requests
|
||||
import csv
|
||||
import datetime
|
||||
|
||||
|
||||
class PriceExtractor:
    """Fetch product prices from https://digitec.ch/ and log them to a CSV file.

    The product list is a CSV file in which every row starts with an integer
    product number followed by the URL of the product page.  Each price that
    could be extracted is appended to the price file as
    ``product number, price``.
    """

    # Text that precedes the price inside the page source.
    _PRICE_MARKER = 'property="product:price:amount'
    # Distance from the start of the marker to the first character of the
    # price.  NOTE(review): presumably covers the rest of the attribute plus
    # ` content="` -- confirm against a current product page.
    _PRICE_OFFSET = 41
    # Number of characters inspected for the price value (the original code
    # sliced from offset 41 up to offset 60).
    _PRICE_WINDOW = 19

    def __init__(self, products_file="../../data/products.csv",
                 prices_file="../../data/prices.csv", timeout=30):
        """Start a fresh price file and load the product list.

        Arguments:
        products_file -- CSV file with the product numbers and links.
        prices_file -- CSV file that receives the extracted prices; it is
                       overwritten and gets a "version from" header row.
        timeout -- seconds to wait for a web request before giving up.
        """
        self.__prices_file = prices_file
        self.__timeout = timeout
        self.__raw_list = []
        # newline="" lets the csv module control the line endings itself.
        with open(prices_file, "w", newline="") as pricedata:
            writing = csv.writer(pricedata, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            writing.writerow(["version from", datetime.datetime.now()])
        self.readfile(products_file)

    def readfile(self, filename):
        """Read a new file that contains links in csv format.

        Arguments:
        filename -- path of the csv file, either absolute
                    (e.g. /home/[username]/webscratching/products.csv) or
                    relative to the working directory
                    (e.g. ../pricedata/products3.csv).

        The file is only read, never modified.  Returns the content of the
        file as a list of rows; the same list is consumed by
        ``digitec_extractor``.

        Example: products = extractor.readfile("../pricedata/products2.csv")
        """
        with open(filename, "r", newline="") as products:
            self.__raw_list = list(csv.reader(products, delimiter=','))
        print("needing to update", len(self.__raw_list), "prices")
        return self.__raw_list

    @staticmethod
    def _parse_price(page_source):
        """Return the price found in *page_source* as float, or None if absent."""
        start = page_source.find(PriceExtractor._PRICE_MARKER)
        if start == -1:
            return None
        start += PriceExtractor._PRICE_OFFSET
        raw_price = page_source[start:start + PriceExtractor._PRICE_WINDOW]
        # The price ends at the closing quote of the attribute value.
        price_text = raw_price.split('"', 1)[0]
        try:
            return float(price_text)
        except ValueError:
            return None

    def digitec_extractor(self):
        """Run through the entire list of links that was loaded either when
        creating the object or through the method "readfile".

        Every entry is removed from the list once it has been handled.
        Entries that are malformed, cannot be fetched or contain no price are
        reported and skipped.  This method takes no arguments and returns
        nothing; the prices are appended to the price file.
        """
        while self.__raw_list:
            entry = self.__raw_list.pop(0)
            try:
                product_number = int(entry[0])
                website = entry[1]
            except (IndexError, ValueError):
                print("Malformed product entry, skipping..")
                continue

            print("fetching data... This step might take a couple of seconds")
            try:
                response = requests.get(website, timeout=self.__timeout)
            except requests.RequestException as error:
                print("Request failed, skipping..", error)
                continue
            print("recieved data from", website)

            # Any HTTP error status (not only 404) means there is no usable page.
            if not response.ok:
                print("Ressource unavailable, skipping..")
                continue

            price = self._parse_price(response.text)
            if price is None:
                print("No price found, skipping..")
                continue

            print("The price is following: ", price, "CHF\n")
            # Append row by row so already fetched prices survive a later failure.
            with open(self.__prices_file, "a", newline="") as pricedata:
                writing = csv.writer(pricedata, delimiter=',', quoting=csv.QUOTE_MINIMAL)
                writing.writerow([product_number, price])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the extraction only when this file is executed as a script, so that
    # merely importing the module neither overwrites the price file nor
    # starts any web requests.
    digitec_ext = PriceExtractor()
    digitec_ext.digitec_extractor()
|
||||
41
bin/webscratching/top_games.py
Normal file
41
bin/webscratching/top_games.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import bin.lib.website_source_grabber
|
||||
|
||||
|
||||
class TopGamesUpdater:
    """Collect the titles listed on the Steam top-sellers search page."""

    # Opening tag that precedes every title in the page source.
    _TITLE_MARKER = "<div class=\"col search_name ellipsis\">"
    # Distance from the start of the marker to the first character of the
    # title.  NOTE(review): depends on the exact markup and whitespace of the
    # page -- confirm against the current page source.
    _TITLE_OFFSET = 80
    # Maximum number of characters inspected for a single title.
    _TITLE_WINDOW = 120

    def __init__(self):
        """Create an updater that holds no page source and no titles yet."""
        self.__source = ""
        self.__return_value = []
        # Public attributes of the original class; no longer used internally
        # but kept so existing code that reads them keeps working.
        self.letter = ""
        self.err = ""

    def updater(self):
        """Download the top-sellers page and return the list of titles.

        Every call fetches the page again and returns a freshly built list.
        """
        # NOTE(review): grabber() presumably returns the page source as a
        # string -- confirm in bin.lib.website_source_grabber.
        self.__source = bin.lib.website_source_grabber.WebsiteSourceGrabber().grabber("https://store.steampowered.com/search/?filter=topsellers")
        self.list_generator()
        return self.__return_value

    def list_generator(self):
        """Extract all titles from the stored page source.

        Scans the source for every occurrence of the title marker, takes the
        text that starts a fixed offset behind it and cuts it at the next
        "<".  The result replaces any previously extracted titles.
        """
        titles = []
        position = 0
        while True:
            # Searching from `position` avoids copying the remaining source
            # on every iteration.
            found = self.__source.find(self._TITLE_MARKER, position)
            if found == -1:
                break
            position = found + self._TITLE_OFFSET
            window = self.__source[position:position + self._TITLE_WINDOW]
            titles.append(window.split("<", 1)[0])
        self.__return_value = titles
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Fetch the top sellers only when this file is executed as a script, so
    # that importing the module does not trigger a web request.
    TopGamesUpdater().updater()
|
||||
Reference in New Issue
Block a user