
Drop Down To The Bottom Of The Page And Get Urls For Each Entry In Python

With help from the first link here, I'm able to crawl a single page's desired contents. As the next step, I would like to go back to the parent page, drop down to the bottom of the page, th…

Solution 1:

There's a totalPage value in the response, so you can use it to loop over all the pages, grab each page's data, and dump everything into a DataFrame.
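For reference, the response body looks roughly like this (the shape is inferred from the parsing code below; the values are illustrative):

{
    "result": {
        "totalPage": 22,
        "data": "[{...}, {...}]"
    }
}

Note that result.data is itself a JSON-encoded string, which is why the code calls json.loads on it instead of using it directly.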

Here's how:

import json

import pandas as pd
import requests

headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'Accept': 'application/json, text/plain, */*',
    'DNT': '1',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://www.cspea.com.cn',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://www.cspea.com.cn/list?c=C01&i=1&p=5000-999999999&s=A06,A07',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

data = {
    'filter_projectText': '',
    'filter_projectClassifyCode': 'C01',
    'filter_tagCode': '',
    'filter_industryCode': '',
    'filter_industryCodeTwo': '',
    'filter_projectType': '1',
    'filter_tradInstitutionId': '',
    'filter_zone': '',
    'filter_groupZone': '',
    'filter_minPrice': '',
    'filter_maxPrice': '',
    'filter_minTradeValue': '5000',
    'filter_maxTradeValue': '999999999',
    'filter_minPercent': '',
    'filter_maxPercent': '',
    'filter_startDate': '',
    'filter_endDate': '',
    'filter_startTradeDate': '',
    'filter_endTradeDate': '',
    'filter_startPreDate': '',
    'filter_endPreDate': '',
    'filter_businessStatus': 'A06,A07',
    'filter_isGz': '',
    'filter_isHot': '',
    'filter_publishDateSort': 'desc',
    'filter_projectPriceSort': '',
    'filter_tradeValueSort': '',
    'filter_startExpireDate': '',
    'filter_endExpireDate': '',
    'pageIndex': '1',
    'pageSize': '12',
    'sysCode': '1',
    'filter_textDateSort': ''
}

end_point = 'https://www.cspea.com.cn/proxy/projectInterface/project/searchIndex'
df_container = []
with requests.Session() as connection:
    # First request is only used to find out how many pages there are
    response = connection.post(end_point, headers=headers, data=data, verify=False)
    last_page = int(response.json()["result"]["totalPage"]) + 1
    for page in range(1, last_page):
        data["pageIndex"] = str(page)
        print(f"Going through page {page}...")
        r = connection.post(end_point, headers=headers, data=data, verify=False).json()
        # "data" comes back as a JSON-encoded string, hence the extra json.loads
        df_container.extend(json.loads(r["result"]["data"]))

df = pd.DataFrame(df_container)

# Convert the epoch-millisecond timestamp columns to plain dates
cols = ['publishDate', 'expireDate', 'tradeDate']
df[cols] = df[cols].apply(lambda x: pd.to_datetime(x, unit='ms').dt.date)

df.to_csv("items_scraped.csv", index=False)
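
One caveat: verify=False skips TLS certificate verification, so urllib3 prints an InsecureRequestWarning for every request. If you want to silence that noise and be a bit gentler on the server, a minimal optional tweak (a sketch, not required for the answer above) is:

import time

import urllib3

# Silence only the warning caused by verify=False, not all warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ...then, inside the page loop, pause briefly between requests:
time.sleep(0.5)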

Output:

A .csv file with 254 entries. You can drop any columns you want and reshape the df to your liking.
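
For example, to keep only a handful of columns (a sketch; apart from the three date columns above, the names are hypothetical, so check df.columns for what the API actually returns):

# "projectName" is a guessed column name -- swap in real ones from df.columns
wanted = ['projectName', 'publishDate', 'expireDate', 'tradeDate']
df = df[[c for c in wanted if c in df.columns]]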

