-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaffiliate_utils.py
122 lines (97 loc) · 3.35 KB
/
affiliate_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Affiliate Utils
Utility module to process affiliate URLs.
Supports the following URLs:
* TopCashback
* Mediaplex
"""
import lxml.html
import urlparse
import urllib2
# Names exported by ``from affiliate_utils import *``; the underscore-prefixed
# helpers defined further down are deliberately left out.
__all__ = ['url_to_domain', 'strip_www', 'get_target']
def url_to_domain(url):
    """
    Returns the domain (host name) portion of a URL.

    >>> url_to_domain('http://infinitemonkeycorps.net/docs/pph/#unittest')
    'infinitemonkeycorps.net'
    >>> url_to_domain('www.google.com/search')
    'www.google.com'
    >>> url_to_domain('ftp://example.com/file')
    'example.com'

    :param url: URL, with or without a scheme
    :type url: string
    :return: Host name of the URL (lower case), or None if no host is found
    :rtype: string
    """
    parsed = urlparse.urlparse(url)
    # Only bare input such as 'example.com/path' needs a scheme prepended so
    # that the host ends up in the network-location part.  URLs that already
    # carry a host ('ftp://host/', '//host/') are used as parsed; prefixing
    # those as well used to yield 'ftp' or None instead of the real host.
    if not parsed.netloc and parsed.scheme not in ('http', 'https'):
        parsed = urlparse.urlparse('http://' + url)
    return parsed.hostname
def strip_www(domain):
    """
    Removes a leading 'www.' from a domain name, if there is one.

    >>> strip_www('www.google.com')
    'google.com'
    >>> strip_www('google.com')
    'google.com'

    :param domain: domain name
    :type domain: string
    :return: The domain name without its 'www.' prefix; unchanged if it
             has no such prefix
    :rtype: string
    """
    prefix = 'www.'
    return domain[len(prefix):] if domain.startswith(prefix) else domain
def get_target(url):
    """
    Returns the target URL of an affiliate link.

    Links on a recognised affiliate domain are resolved by fetching the
    forwarding page; any other URL is returned unchanged.

    >>> get_target('https://www.topcashback.co.uk/earncashback.aspx?mpurl=currys&continue=1')
    'http://www.currys.co.uk/gbuk/index.html?srcid=369&xtor=AL-1&cmpid=aff~TopCashBack~'

    :param url: URL
    :type url: string
    :return: Target URL that supplied URL resolves to
    :rtype: string
    """
    host = url_to_domain(url)
    if not host:
        # No host could be extracted (e.g. an empty string), so this cannot
        # be an affiliate link; hand it back like any other unknown URL
        # instead of failing inside strip_www().
        return url

    domain = strip_www(host)
    if domain in ('topcashback.com', 'topcashback.co.uk'):
        return _get_tcb_target(url)
    if domain in ('adfarm.mediaplex.com',):
        # Mediaplex has its own resolver; this branch used to call the
        # TopCashback one, which left _get_mediaplex_target() unused.
        return _get_mediaplex_target(url)
    return url
def _follow_first_link(url):
    """Returns the target URL by following the first <A HREF='...'>link</A>
    found.

    The page at ``url`` is downloaded, the first anchor that carries an
    ``href`` attribute is located, and that link is then opened; the URL of
    the response to that second request is the result.

    :param url: URL of the forwarding page
    :type url: string
    :return: URL reached by opening the first link on the page
    :rtype: string
    :raises IndexError: if the page contains no anchor with an ``href``
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
        # URL the page was actually served from; relative hrefs are
        # resolved against it below.
        page_url = response.geturl()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # TopCashback brings up a banner page with Javascript redirect
    # - find the manual redirect
    root = lxml.html.fromstring(html)
    # Anchors without an href (e.g. <a name="...">) cannot be followed.
    links = root.xpath("//a[@href]")
    if not links:
        raise IndexError("no <a href='...'> link found at %s" % url)
    link_url = urlparse.urljoin(page_url, links[0].get('href'))

    target = urllib2.urlopen(link_url)
    try:
        return target.geturl()
    finally:
        target.close()
def _get_tcb_target(url):
    """Resolves a TopCashback splash screen forwarding page to its target.

    Delegates to _follow_first_link() and returns whatever URL it reports.
    """
    target_url = _follow_first_link(url)
    return target_url
def _get_mediaplex_target(url):
    """Resolves a Mediaplex forwarding page to its target URL.

    Delegates to _follow_first_link() and returns whatever URL it reports.

    Example (long URLs are wrapped here for readability):

    Source:
        http://adfarm.mediaplex.com/ad/ck/15368-110724-36269-43?
        CJAID=801842&CJPID=1777643&ttp=100&rfr=123

    Content:
        <html><head><title></title>
        <script language="JavaScript1.1">
        <!--
        window.location.replace("https://promotions.betfair.com/value-uk-
        football-dual-aff-t?CID=&PLA=153681107243626943&ttp=100&rfr=123&
        mpch=ads");
        //-->
        </script>
        <noscript>
        <meta http-equiv="refresh"
        content="0;URL=https://promotions.betfair.com/value-uk-football-
        dual-aff-t?CID=&PLA=153681107243626943&ttp=100&rfr=123&mpch=ads">
        </noscript>
        </head><body><a href="https://promotions.betfair.com/value-uk-football
        -dual-aff-t?CID=&PLA=153681107243626943&ttp=100&rfr=123&mpch=ads">
        Click Here</a></body></html>

    Target:
        https://promotions.betfair.com/value-uk-football-dual-aff-t?
        CID=&PLA=153681107243626943&ttp=100&rfr=123&mpch=ads
    """
    target_url = _follow_first_link(url)
    return target_url