You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
auto/spider/spider_get_and_check_ssq.py

94 lines
3.4 KiB

# -*-coding: utf-8 -*-
import datetime
import os
import sqlite3
from selenium import webdriver
import httpx
def get_cookies(url):
    """Open *url* in a headless Chrome session and return its cookies.

    Returns the list of cookie dicts produced by selenium's
    ``driver.get_cookies()``, or ``None`` when the page set no cookies
    (preserving the original implicit-None contract).
    """
    chrome_options = webdriver.ChromeOptions()
    for arg in ('--headless', '--no-sandbox', '--disable-gpu', '--disable-dev-shm-usage'):
        chrome_options.add_argument(arg)
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # get_cookies() returns [] when nothing was set; `or None` keeps the
        # original behavior of returning None in that case.
        return driver.get_cookies() or None
    finally:
        # Bug fix: the original never called quit(), leaking a Chrome
        # process (and its temp profile) on every invocation.
        driver.quit()
def req(url, cookies):
    """GET *url* with the supplied Cookie header and feed the JSON
    ``result`` payload to :func:`data_handle`.

    On a non-200 response the status code is printed and appended to
    today's log file, and the function returns without parsing.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": cookies,
        "Host": "www.cwl.gov.cn",
        "User-Agent": "Mozilla/5.0"
    }
    with httpx.Client() as client:
        res = client.get(url, headers=headers, follow_redirects=True)
        if res.status_code != 200:
            print(res.status_code)
            # NOTE(review): `get_path` is not imported anywhere in this file
            # (the db code uses `utils_get_path`) -- confirm which helper
            # module is meant; as written this line raises NameError.
            log_file_path = os.path.join(get_path.get_logs_path(), str(datetime.date.today()) + '.log')
            with open(log_file_path, 'a') as f:
                # Bug fix: the original wrote the literal placeholder
                # "\n spider_dlt: %s" -- interpolate the status code.
                f.write("\n spider_dlt: %s" % res.status_code)
            return
        data_handle(res.json()['result'])
def data_handle(source_data):
    """Rebuild the ``ssq`` table in ssq.db from a list of draw dicts.

    Each entry is expected to carry 'code', a comma-separated 'red'
    string of six numbers, 'blue', 'date', 'sales', 'poolmoney' and
    'content' keys (as returned by the cwl.gov.cn draw-notice API).
    """
    ssq_db_path = os.path.join(utils_get_path.get_db_path(), 'ssq.db')
    conn = sqlite3.connect(ssq_db_path)
    try:
        c = conn.cursor()
        # Bug fix: the original dropped a table named `data` but created
        # and inserted into `ssq`, so any re-run hit PRIMARY KEY
        # conflicts. Drop the table we actually recreate.
        c.execute('drop table if exists ssq;')
        c.execute(
            'create table if not exists `ssq` (id INT PRIMARY KEY NOT NULL, `code` varchar(10),`red1` varchar(2),`red2` varchar(2),`red3` varchar(2),`red4` varchar(2),`red5` varchar(2),`red6` varchar(2),`blue` varchar(2),`date` varchar(12),`sales` varchar(15),`poolmoney` varchar(15),`content` varchar(255));')
        # Parameterized statement: the original built SQL with .format(),
        # which broke (or allowed injection) whenever a field contained a
        # quote character.
        insert_sql = (
            'INSERT INTO ssq (id, code, red1, red2, red3, red4, red5, red6, '
            'blue, date, sales, poolmoney, content) '
            'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)')
        for row_id, data in enumerate(source_data, start=1):
            # .get() instead of .setdefault(): setdefault mutated the
            # caller's dicts by inserting None for every missing key.
            reds = data.get('red', '').split(',')
            c.execute(insert_sql, (
                row_id,
                data.get('code'),
                reds[0], reds[1], reds[2], reds[3], reds[4], reds[5],
                data.get('blue'),
                data.get('date'),
                data.get('sales'),
                data.get('poolmoney'),
                data.get('content'),
            ))
        # One commit for the whole batch instead of one per row.
        conn.commit()
    finally:
        # Close the connection even if an insert fails (original leaked
        # it on any exception).
        conn.close()
if __name__ == "__main__":
    # Draw-notice endpoint for the "ssq" (double color ball) lottery,
    # latest 10 draws.
    url = 'http://www.cwl.gov.cn/cwl_admin/front/cwlkj/search/kjxx/findDrawNotice?name=ssq&issueCount=&issueStart=&issueEnd=&dayStart=&dayEnd=&pageNo=1&pageSize=10&week=&systemType=PC'

    # Live cookie harvesting (via selenium) is currently disabled:
    # result_cookie = util_get_cookies.get_cookies(url)
    # cookies = '{}={}'.format(result_cookie[0].setdefault('name'), result_cookie[0].setdefault('value'))
    # print(cookies)

    # Hard-coded cookie captured for testing.
    cookies = "HMF_CI=1b2fd73192f2054a429b2bfa4f58c3ff98119441420133cc8a04ca9c95aa2266eaec5bb7cf1d37df5f9864b8629ba407bacc9c58cadf26e2d726582df3870b0969"

    req(url, cookies)