Skip to content

utils.web

Utils - web¤

get_page_content(link, session=None, session_query_configs=None, method='GET', data=None) ¤

Download a page and return the response together with its status code

Parameters:

Name Type Description Default
link str

link to get content from

required
session Optional[Session]

requests session object, defaults to a new session

None
session_query_configs Optional[dict]

session query configs, defaults to get_session_query_configs

None
method Optional[str]

method to use, defaults to "GET"

'GET'
data Optional[dict]

data to send with the request, defaults to None

None

Returns:

Type Description

page content data

Source code in kirsche/utils/web.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def get_page_content(
    link: str,
    session: Optional[requests.Session] = None,
    session_query_configs: Optional[dict] = None,
    method: Optional[str] = "GET",
    data: Optional[dict] = None,
) -> dict:
    """Request a page and return the response together with its status code.

    :param link: link to get content from
    :param session: requests session object, defaults to a new session created by get_session
    :param session_query_configs: keyword arguments forwarded to the session call, defaults to get_session_query_configs()
    :param method: HTTP method, "GET" or "POST" (case-insensitive), defaults to "GET"; None is treated as "GET"
    :param data: data to send with a POST request, defaults to None (sent as an empty dict); not used for GET
    :return: dictionary of the form {"status": status code, "content": response object returned by the session}
    :raises ValueError: if method is neither "GET" nor "POST"
    """

    # Validate the method before doing any other work: previously an
    # unsupported value fell through both branches and crashed later with
    # an UnboundLocalError on `content`.
    method = "GET" if method is None else method.upper()
    if method not in ("GET", "POST"):
        raise ValueError(
            f"Unsupported method: {method!r}, expected 'GET' or 'POST'"
        )

    if not session_query_configs:
        session_query_configs = get_session_query_configs()

    if not session:
        session = get_session(
            retry_params=None,
            session=None,
        )

    if method == "GET":
        content = session.get(link, **session_query_configs)
    else:
        if data is None:
            data = {}
        content = session.post(link, data=data, **session_query_configs)

    return {"status": content.status_code, "content": content}

get_random_user_agent(browsers=None) ¤

get_random_user_agent returns a random user agent. We provide two predefined browsers, chrome and firefox.

Parameters:

Name Type Description Default
browsers Optional[Union[str, list]]

which browser(s) to use, defaults to ["chrome", "firefox"]

None

Returns:

Type Description
dict

dictionary for the requests module to consume, of the form {'User-Agent': "blabla"}

Source code in kirsche/utils/web.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def get_random_user_agent(browsers: Optional[Union[str, list]] = None) -> dict:
    """
    get_random_user_agent returns a random user agent.
    We provide two predefined browser pools, "chrome" and "firefox".

    :param browsers: which browser(s) to draw from, either a single name or a list of names, defaults to ["chrome", "firefox"]
    :return: dictionary for the requests module to consume, of the form {'User-Agent': "blabla"}
    :raises ValueError: if browsers is an empty list
    :raises KeyError: if any requested browser is not one of the predefined pools
    """

    if browsers is None:
        browsers = ["chrome", "firefox"]
    if isinstance(browsers, str):
        browsers = [browsers]
    if not browsers:
        # random.choice on an empty pool would fail with an opaque IndexError
        raise ValueError("browsers must contain at least one browser name")

    chrome_user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    ]
    # NOTE(review): every string in this pool identifies as MSIE/Trident
    # (Internet Explorer), not Firefox -- confirm whether real Firefox user
    # agents were intended before changing the data.
    firefox_user_agents = [
        "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)",
    ]

    user_agents_dict = {"chrome": chrome_user_agents, "firefox": firefox_user_agents}

    # Fail explicitly for unknown browsers: the lookup below would raise a
    # bare KeyError anyway, so log once and raise the same exception type
    # with a readable message.
    unknown_browsers = set(browsers) - set(user_agents_dict)
    if unknown_browsers:
        logger.error(f"Unknown browser: {unknown_browsers}")
        raise KeyError(f"Unknown browser: {unknown_browsers}")

    # Flatten the selected pools into one candidate list
    user_agent_list = [
        user_agent
        for browser in browsers
        for user_agent in user_agents_dict[browser]
    ]

    return {"User-Agent": random.choice(user_agent_list)}

get_session(retry_params={'retries': 5, 'backoff_factor': 0.3, 'status_forcelist': (500, 502, 504)}, session=None) ¤

get_session prepares a session object.

Parameters:

Name Type Description Default
retry_params Optional[dict]

the rules to retry, defaults to {"retries": 5, "backoff_factor": 0.3, "status_forcelist": (500, 502, 504)}

{'retries': 5, 'backoff_factor': 0.3, 'status_forcelist': (500, 502, 504)}
session Optional[Session]

a requests session object to be used to query, defaults to None

None

Returns:

Type Description
Session

a requests session object

Source code in kirsche/utils/web.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def get_session(
    retry_params: Optional[dict] = None,
    session: Optional[requests.Session] = None,
) -> requests.Session:
    """
    get_session prepares a session object with a retrying adapter mounted for http:// and https://.

    :param retry_params: the rules to retry, defaults to {"retries": 5, "backoff_factor": 0.3, "status_forcelist": (500, 502, 504)}; keys that are missing fall back to these defaults
    :param session: a requests session object to be used to query, defaults to None, in which case a new session is created
    :return: a requests session object
    """

    # The defaults are built per call rather than in the signature, so no
    # mutable object is shared between invocations.
    default_retry_params = {
        "retries": 5,
        "backoff_factor": 0.3,
        "status_forcelist": (500, 502, 504),
    }
    # Merge caller-supplied values over the defaults so that a partial dict
    # (e.g. only "retries") no longer passes None for the other settings.
    retry_params = {**default_retry_params, **(retry_params or {})}

    if session is None:
        session = requests.Session()

    retries = retry_params["retries"]
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=retry_params["backoff_factor"],
        status_forcelist=retry_params["status_forcelist"],
    )

    # The same adapter (and therefore the same retry policy) serves both schemes
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session

get_session_query_configs(headers=None, timeout=(5, 14), proxies={}, cookies={'language': 'en'}) ¤

get_session_query_configs creates a session config dictionary for session to use. These are the keyword arguments of the session get or post methods. Proxies can be set by providing a dictionary of the form

{
    'http': some super_proxy_url,
    'https': some super_proxy_url,
}

Parameters:

Name Type Description Default
headers Optional[dict]

headers of the request, such as the user agent, defaults to a random user agent from get_random_user_agent

None
timeout Optional[list]

timeout strategy, defaults to (5, 14)

(5, 14)
proxies Optional[dict]

proxy configs, defaults to {}

{}
cookies Optional[dict]

cookie configs, defaults to {"language": "en"}

{'language': 'en'}

Returns:

Type Description
dict

dictionary of session configs for session methods, e.g., get, to use.

Source code in kirsche/utils/web.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def get_session_query_configs(
    headers: Optional[dict] = None,
    timeout: Optional[Union[tuple, list]] = (5, 14),
    proxies: Optional[dict] = None,
    cookies: Optional[dict] = None,
) -> dict:
    """
    get_session_query_configs creates a session config dictionary for session to use. These are the keyword arguments of the session get or post methods.
    Proxies can be set by providing a dictionary of the form
    ```python
    {
        'http': some super_proxy_url,
        'https': some super_proxy_url,
    }
    ```
    If the environment variable SC_API_KEY is set and the headers carry no "x-api-key" entry, its value is added as the "x-api-key" header.

    :param headers: headers of the request such as the user agent, defaults to a random user agent from get_random_user_agent
    :param timeout: timeout strategy, defaults to (5, 14)
    :param proxies: proxy configs, defaults to {}
    :param cookies: cookie configs, defaults to {"language": "en"}
    :return: dictionary of session configs (headers, timeout, proxies, cookies) for session methods, e.g., get, to use.
    """

    # Defaults are created per call: the returned dict hands these objects
    # to the caller, so sharing signature-level dicts would leak mutations
    # from one call into the next.
    if cookies is None:
        cookies = {"language": "en"}

    if proxies is None:
        proxies = {}

    if headers is None:
        headers = get_random_user_agent()
    else:
        # Work on a copy so the caller's dict is not modified when the API key is injected
        headers = dict(headers)

    api_key = os.getenv("SC_API_KEY")
    if api_key and "x-api-key" not in headers:
        headers["x-api-key"] = api_key

    if timeout is None:
        timeout = (5, 14)
    elif isinstance(timeout, list):
        # The parameter was historically annotated as a list; requests is
        # documented to take a (connect, read) tuple, so normalise here.
        timeout = tuple(timeout)

    # timeout was previously resolved but dropped from the result, which
    # meant requests were issued without any timeout.
    return dict(headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)