max_time = 84600
start_time = time.time()
count=1
while (time.time() - start_time) < max_time:
print("Loop " + str(count) + " startet.")
df = pd.DataFrame()
l=[]
try:
soup = bs.BeautifulSoup(urllib.request.urlopen('https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete').read(),'lxml')
for paragraph in soup.find_all("a"):
if r"/expose/" in str(paragraph.get("href")):
l.append(paragraph.get("href").split("#")[0])
l = list(set(l))
for item in l:
try:
soup = bs.BeautifulSoup(urllib.request.urlopen('https://www.immobilienscout24.de'+item).read(),'lxml')
data = pd.DataFrame(json.loads(str(soup.find_all("script")).split("keyValues = ")[1].split("}")[0]+str("}")),index=[str(datetime.now())])
data["URL"] = str(item)
beschreibung = []
for i in soup.find_all("pre"):
beschreibung.append(i.text)
data["beschreibung"] = str(beschreibung)
df = df.append(data)
except Exception as e:
print(str(datetime.now())+": " + str(e))
l = list(filter(lambda x: x != item, l))
print("ID " + str(item) + " entfernt.")
df.to_csv(".../Rohdaten/"+str(datetime.now())[:19].replace(":","").replace(".","")+".csv",sep=";",decimal=",",encoding = "utf-8",index_label="timestamp")
print("Loop " + str(count) + " endet.")
count+=1
time.sleep(60)
except Exception as e:
print(str(datetime.now())+": " + str(e))
time.sleep(60)
print("FERTIG!")