Be more careful crawling the stuff
Specify a credible user agent when retrieving data from URLs. Also change the default member report update interval to 30 days instead of a week. Partial bug 1361927 Change-Id: I7f956912c2cc3aa2765a20ed193e1e34ecbf7f91
This commit is contained in:
parent
9f8d82d801
commit
8d08624711
@ -18,7 +18,7 @@
|
||||
# listen_port = 8080
|
||||
|
||||
# Number of days to update members
|
||||
# days_to_update_members = 7
|
||||
# days_to_update_members = 30
|
||||
|
||||
# The address of file with corrections data
|
||||
# corrections_uri = https://git.openstack.org/cgit/stackforge/stackalytics/plain/etc/corrections.json
|
||||
|
@ -29,7 +29,7 @@ OPTS = [
|
||||
help='The address dashboard listens on'),
|
||||
cfg.IntOpt('listen-port', default=8080,
|
||||
help='The port dashboard listens on'),
|
||||
cfg.IntOpt('days_to_update_members', default=7,
|
||||
cfg.IntOpt('days_to_update_members', default=30,
|
||||
help='Number of days to update members'),
|
||||
cfg.StrOpt('corrections-uri',
|
||||
default=('https://git.openstack.org/cgit/'
|
||||
|
@ -12,6 +12,7 @@
|
||||
# implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import random
|
||||
|
||||
import re
|
||||
import time
|
||||
@ -106,5 +107,7 @@ def log(uri, runtime_storage_inst, days_to_update_members, members_look_ahead):
|
||||
LOG.debug('New member: %s', member['member_id'])
|
||||
yield member
|
||||
|
||||
time.sleep(random.random() * 5)
|
||||
|
||||
LOG.debug('Last_member_index: %s', last_member_index)
|
||||
runtime_storage_inst.set_by_key('last_member_index', last_member_index)
|
||||
|
@ -17,6 +17,7 @@ import cgi
|
||||
import datetime
|
||||
import gzip
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
|
||||
@ -86,9 +87,20 @@ def check_email_validity(email):
|
||||
return False
|
||||
|
||||
|
||||
user_agents = [
|
||||
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) Gecko/20100101 Firefox/32.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_6) AppleWebKit/537.78.2',
|
||||
'Mozilla/5.0 (Windows NT 6.3; WOW64) Gecko/20100101 Firefox/32.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X) Chrome/37.0.2062.120',
|
||||
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
|
||||
]
|
||||
|
||||
|
||||
def read_uri(uri):
|
||||
try:
|
||||
fd = six.moves.urllib.request.urlopen(uri)
|
||||
req = six.moves.urllib.request.Request(
|
||||
url=uri, headers={'User-Agent': random.choice(user_agents)})
|
||||
fd = six.moves.urllib.request.urlopen(req)
|
||||
raw = fd.read()
|
||||
fd.close()
|
||||
return raw
|
||||
|
Loading…
x
Reference in New Issue
Block a user