Scan website and extract data using Python
from bs4 import BeautifulSoup
import requests

# Seconds to wait for a listing page before giving up; without a timeout a
# stalled connection would hang the whole scan.
_REQUEST_TIMEOUT_SECONDS = 30

# Catalogue sections to scan. Each entry has:
#   "URL"     - listing URL; the page number is appended to it.
#   "Pages"   - presumably the number of listing pages (pages 1..Pages are
#               fetched) -- TODO confirm against the site.
#   "FT_URL"  - base of the archive.org download URL for the section.
#   "start" / "end" - only present when FT_URL has no trailing "/": slice
#               bounds of the digits inside a book ID that select the
#               archive.org folder (see _build_download_url).
# The comment next to each FT_URL is an example of a final download URL.
Sources = [
    {
        "URL": "https://k-tb.com/books/disc/تاريخ?page=",
        "Pages": 100,
        "FT_URL": "https://archive.org/download/history",
        # 'https://archive.org/download/history10000/history09907.zip'
        "start": 7,
        "end": 9,
    },
    {
        "URL": "https://k-tb.com/books/disc/الزهد-والرقائق?page=",
        "Pages": 27,
        "FT_URL": "https://archive.org/download/tarbyah",
        # 'https://archive.org/download/tarbyah3000/tarbyah02693.zip'
        "start": 7,
        "end": 9,
    },
    {
        "URL": "https://k-tb.com/books/disc/الحديث-وعلومه?page=",
        "Pages": 87,
        "FT_URL": "https://archive.org/download/hadeeth",
        # 'https://archive.org/download/hadeeth9000/hadeeth8697.zip'
        "start": 7,
        "end": 8,
    },
    {
        "URL": "https://k-tb.com/books/disc/التفسير-وعلوم-القرآن?page=",
        "Pages": 120,
        "FT_URL": "https://archive.org/download/Quran",
        # 'https://archive.org/download/Quran12000/Quraan11975.zip'
        "start": 6,
        "end": 8,
    },
    {
        "URL": "https://k-tb.com/books/disc/العقيدة-والمذاهب-والأديان?page=",
        "Pages": 92,
        "FT_URL": "https://archive.org/download/aqidah01/",
        # https://archive.org/download/aqidah01/Aqidah09200.zip
    },
    {
        "URL": "https://k-tb.com/books/disc/الدعوة-والاحتساب?page=",
        "Pages": 10,
        "FT_URL": "https://archive.org/download/dawah1000/",
        # https://archive.org/download/dawah1000/dawah00927.zip
    },
    {
        "URL": "https://k-tb.com/books/disc/الثقافة-الإسلامية-?page=",
        "Pages": 6,
        "FT_URL": "https://archive.org/download/Th2000/",
        # 'https://archive.org/download/Th2000/Th1898.zip'
    },
    {
        "URL": "https://k-tb.com/books/disc/السيرة-النبوية?page=",
        "Pages": 11,
        "FT_URL": "https://archive.org/download/serah1000/",
        # https://archive.org/download/serah1000/serah01055.zip
    },
    {
        "URL": "https://k-tb.com/books/disc/دوريات-ومجلات?page=",
        "Pages": 4,
        "FT_URL": "https://archive.org/download/magazine1000/",
        # https://archive.org/download/magazine1000/magazine0003.zip
    },
]


def _build_download_url(source: dict, book_id: str) -> str:
    """Return the archive.org ``.zip`` URL for *book_id* within *source*.

    When ``source["FT_URL"]`` ends with "/", the ID is appended directly.
    Otherwise the digits ``book_id[start:end]`` are read as an integer,
    incremented by one and followed by "000/" to form the folder suffix,
    e.g. ID "history09907" with start=7, end=9 gives ".../history10000/".

    Raises:
        KeyError: if *source* needs "start"/"end" but does not define them.
        ValueError: if the sliced part of *book_id* is not an integer.
    """
    base = source["FT_URL"]
    if base.endswith("/"):
        return base + book_id + ".zip"
    folder_number = int(book_id[source["start"]:source["end"]]) + 1
    return "{}{}000/{}.zip".format(base, folder_number, book_id)


def _iter_books(source):
    """Yield ``(book_id, title, author, original_url)`` for each book row.

    Fetches listing pages 1..``source["Pages"]`` inclusive and reads the
    direct ``<tr>`` children of the first ``table.table-hover`` on each page.
    ``original_url`` is the href of the first link in the fourth cell, or
    None when that cell has no link.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    # +1 so that the last listing page is fetched as well.
    for page in range(1, source["Pages"] + 1):
        response = requests.get(
            source["URL"] + str(page), timeout=_REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        table = soup.find("table", {"class": "table-hover"})
        if table is None:
            # Page without a results table: nothing to extract.
            continue
        for row in table.find_all("tr", recursive=False):
            cells = row.find_all(["td"], recursive=False)
            if len(cells) < 4:
                # Rows without the four data cells (e.g. header rows).
                continue
            # get_text() instead of .string: .string is None as soon as a
            # cell holds more than one child node.
            book_id = cells[0].get_text(strip=True)
            if not book_id:
                continue
            link = cells[3].find("a")
            yield (
                book_id,
                cells[1].get_text(strip=True),
                cells[2].get_text(strip=True),
                link.get("href") if link is not None else None,
            )


def main():
    """Print the ID, title, author and download URL of every listed book."""
    for source in Sources:
        for book_id, title, author, _original_url in _iter_books(source):
            print(book_id)
            print(title)
            print(author)
            print(_build_download_url(source, book_id))


if __name__ == "__main__":
    main()
No comments:
Post a Comment