added indeed support
start extracting date from "vor X days" where time = today-X
This commit is contained in:
@@ -4,31 +4,43 @@ DEBUG = True
|
||||
def log(*s):
    """Debug helper: print the positional arguments when DEBUG is truthy.

    The arguments are printed as one tuple (``print(s)``), not as separate
    values. Nothing is printed and nothing is returned when DEBUG is falsy.
    """
    if not DEBUG:
        return
    print(s)
|
||||
def indeed_com(url,session):
|
||||
|
||||
def scrap_indeed_com(url,entry,session):
    """Fetch one Indeed result page and convert its listing into job objects.

    url: address of the result page to download.
    entry: configuration entry; only ``entry.tag`` is read here.
    session: accepted for interface compatibility but not used for the
        request (see the note below).

    Returns whatever ``arrayToClass`` builds from the extracted title,
    company, location, date and link arrays plus the tag.
    """
    log("[scrap_indeed_com] url: ",url)

    # NOTE(review): the caller's session is deliberately not reused; the
    # source had the reuse branch commented out. A fresh session with a
    # browser User-Agent is opened for every call instead -- presumably
    # because the site rejects the default client header; confirm.
    with requests.Session() as session:
        session.headers = {
            "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"
        }
        page = session.get(url)
        log(page)
    soup = BeautifulSoup(page.content,"html.parser")

    results = soup.find_all("li",class_= 'css-5lfssm eu4oa1w0') #top level list element

    # Each field is described by an item(...) selector and pulled out of
    # every result element by finder(...).
    location = item("div",{'data-testid':'text-location'},0,"indeed location")
    ar_location = finder(results,location,ATTRS=1,LOCATION_CLEANUP=1)

    company = item("span",{'data-testid':'company-name'},0,"indeed company")
    ar_company = finder(results,company,ATTRS=1)

    title = item("a",'jcs-JobTitle',0,"indeed title")
    ar_title = finder(results,title,GETCHILDREN="span")

    date = item("span",{'data-testid':'myJobsStateDate'},0,"indeed date")
    ar_date = finder(results,date,ATTRS=1)

    # The link uses the same anchor selector as the title.
    link = item("a",'jcs-JobTitle',0,"link")
    ar_link = finder(results,link,LINK=1,BASEURL="https://ch.indeed.com")

    tag = entry.tag#get from config
    return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
|
||||
|
||||
def scrap_jobs(url,entry,session):
|
||||
jobs = []
|
||||
@@ -64,6 +76,28 @@ def scrap_jobs(url,entry,session):
|
||||
tag = entry.tag#get from config
|
||||
return arrayToClass(ar_title,ar_company,ar_location,ar_date,ar_link,tag)
|
||||
|
||||
def next_url_indeed_com(url,session,baseurl):
    """Return the absolute URL of the next Indeed result page, or 0 if none.

    url: result page to download and inspect for a "next" pagination link.
    session: requests session to fetch with. A falsy value (callers pass 0)
        makes the function open a temporary session for this one request.
    baseurl: prefix that is prepended to the link found on the page.

    Returns the concatenated URL string, or 0 when no link was found.
    """
    if not session:
        with requests.Session() as tmp_session:
            page = tmp_session.get(url)
    else:
        # Fetch through the caller's session so that its headers and cookies
        # apply; the module-level requests.get() would bypass them.
        page = session.get(url)

    soup = BeautifulSoup(page.content,"html.parser")
    result_next = soup.find_all("nav",attrs={"role":"navigation"})
    next_ = item("a",{'data-testid':'pagination-page-next'},0)
    next_link = finder(result_next,next_,ATTRS=1,LINK=1)

    # An empty result or the "NOTFound" marker (presumably finder's
    # placeholder for a missing element) both mean there is no next page.
    if not next_link or next_link[0] == "NOTFound":
        return 0

    next_link_str = baseurl + str(next_link[0])
    log(next_link_str)
    # Keep the historical contract: an empty string is reported as 0.
    return next_link_str if next_link_str != '' else 0
|
||||
def next_url_jobs_ch(url,session,baseurl):
|
||||
next_link_str = ''
|
||||
if(session == 0):
|
||||
|
||||
Reference in New Issue
Block a user