%matplotlib inline
from pandas import DataFrame, read_csv
import matplotlib as plt
import matplotlib.pyplot as gplt
import pandas as pd
import sys
# Report the interpreter and library versions this notebook is running on.
# The parenthesised single-argument form prints the same text on Python 2
# and Python 3 (the bare `print x` statement is a SyntaxError on Python 3).
# NOTE: `plt` here is the matplotlib *package* (see `import matplotlib as plt`),
# not pyplot.
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + plt.__version__)
# Load the seeds report: space-separated, first row used as column names.
dfseeds = read_csv('seeds-report.txt', sep=' ')
dfseeds  # bare expression: rendered when it is the last line of a notebook cell

# Frequency of each value in the '[code]' column; value_counts() returns the
# counts ordered from most to least frequent.
codes = dfseeds['[code]'].value_counts()
codes

# Plot only the six most frequent codes.
codes_to_plot = codes[:6]

# The returned Axes is bound to `ax` rather than `plt`: `plt` is this file's
# alias for the matplotlib package, and rebinding it makes `plt.__version__`
# fail if the version cell is run again.
ax = codes_to_plot.plot(kind='pie', figsize=(6, 6), autopct='%.2f%%', fontsize=15)
ax.set_title('Seeds return codes weight', fontsize=25)
# Load the hosts report (space-separated). index_col=False stops pandas from
# using the first column as the index, so rows get the default 0..n-1 index.
dfhosts = read_csv('hosts-report.txt', delimiter=' ', index_col=False)

# Per-host total: the '[#urls]' column plus the '[#remaining]' column.
dfhosts['TOTAL_URLS'] = dfhosts['[#urls]'] + dfhosts['[#remaining]']

# Drop the first row, then rank the remaining hosts by TOTAL_URLS, largest first.
# NOTE(review): row 0 is presumably an aggregate/summary line -- confirm
# against the report format.
# `.iloc` / `.sort_values` replace `.ix` / `.sort(columns=...)`, which have
# been removed from pandas; with the default integer index `.iloc[1:]` selects
# the same rows that `.ix[1:]` did.
order_dfhosts = dfhosts.iloc[1:].sort_values(by='TOTAL_URLS', ascending=False)

# Bar chart of the 20 hosts with the highest totals. The Axes is bound to `ax`
# so the file's `plt` alias is not rebound to an Axes object here.
ax = order_dfhosts[['TOTAL_URLS', '[host]']][:20].plot(kind='bar', x='[host]', figsize=(10, 10))
ax.set_ylabel('Number of URLs', fontsize=15)
ax.set_xlabel('Hosts', fontsize=15)
ax.set_title('TOP 20 Hosts with more URLs', fontsize=25)
# Rows 1..30 (row 0 skipped) of the '[#urls]' / '[host]' columns, in file order.
dfhosts[['[#urls]', '[host]']][1:31]

# Sum of the '[#remaining]' column over every row of the report.
dfhosts['[#remaining]'].sum()

# The 20 rows with the largest '[#remaining]' value, largest first.
# `.sort_values(by=...)` replaces `.sort(columns=...)`, which has been removed
# from pandas.
# NOTE(review): unlike the row-1-onwards slice above, this ranking includes
# row 0 -- confirm that is intended.
top_remaining = dfhosts[['[host]', '[#remaining]']].sort_values(by='[#remaining]', ascending=False)[:20]
ax = top_remaining.plot(kind='bar', x='[host]', figsize=(10, 10))
ax.set_ylabel('Number of URLs', fontsize=15)
ax.set_xlabel('Hosts', fontsize=15)
ax.set_title('TOP 20 hosts with more URLs remaining', fontsize=25)

# Summary statistics for the two count columns.
dfhosts['[#remaining]'].describe()
dfhosts['[#urls]'].describe()
# Read the seed list. The file has no header row; the column is named 'URL'.
df_urls = read_csv('seeds_extracted.txt', header=None, names=['URL'])

# Round 1: keep only entries that end in the literal suffix '.eu'.
# The dot is escaped: an unescaped '.' matches any character, so a pattern
# like '.eu$' would also accept strings that merely end in "eu" (e.g. 'museu').
df_eu_urls = df_urls[df_urls['URL'].str.contains(r'\.eu$')]
print("Number of seeds round1:")
df_eu_urls.count()

# Round 2: drop entries whose value ends with one of these names followed by
# '.eu'. A non-capturing group is used so str.contains() does not warn about
# match groups; the names contain no regex metacharacters.
excluded_eu_names = [
    'dbquanti',
    'autobazar',
    'in-links',
    'myface4u',
    'share-with',
    'prace-jobs',
    'cutegirls',
]
round2_pattern = r'(?:' + '|'.join(excluded_eu_names) + r')\.eu$'
df_eu_urls_2 = df_eu_urls[~df_eu_urls['URL'].str.contains(round2_pattern)]
print("Number of seeds round2:")
df_eu_urls_2.count()

# Round 3: drop entries ending in '.e-mp3s.eu' (the leading dot is required,
# so only names with something in front of 'e-mp3s.eu' are removed).
# The mask is computed from df_eu_urls_2 itself so that its index matches the
# frame being filtered; a mask built from the larger round-1 frame has a
# different index and relies on pandas re-aligning it.
df_eu_urls_3 = df_eu_urls_2[~df_eu_urls_2['URL'].str.contains(r'\.e-mp3s\.eu$')]
print("Number of seeds round3:")
df_eu_urls_3.count()
# Reduce every round-3 entry to its last two dot-separated labels,
# e.g. 'a.b.example.eu' -> ['example', 'eu'].
# The lambda parameter is named `url` so it does not shadow the builtin `str`.
d = DataFrame(df_eu_urls_3['URL'].apply(lambda url: url.split('.')[-2:]))

# Re-join the labels with a dot ('example.eu'). '.'.join() gives the same
# result as label[0] + '.' + label[1] for two labels, and does not raise
# IndexError when an entry contains no dot at all.
dauxfilter = DataFrame(d['URL'].apply(lambda labels: '.'.join(labels)))

# How many entries share each two-label suffix, most frequent first.
dfilter = dauxfilter['URL'].value_counts()
dfilter[:20]  # the 20 most frequent suffixes