#!/usr/bin/env python3
# GNU social groups scanner.
#
# This is an ugly screen-scraping methodology for automatically
# collecting a database of federated groups. Unfortunately, the /api/statusnet/groups/list_all endpoint on many StatusNet
# and GNU social sites seems to require authentication to access,
# while the Web UI's groups list is public.
#
# Pass me a list of server names on the command line if you want to manually specify the list, like:
#
# $ scanner.py http://node1.url https://node2.url ...
#
# Or, if no nodes are specified, I'll try to grab a list from fediverse.org.
# This program is free software. Use, copy, modify to your heart's content.
import sys, os, subprocess, urllib.request, re, json, datetime
import html5lib
import requests
import ssl
import lxml
from lxml import etree
def main():
    """Collect a federated-groups database and save it to groups.json.

    Node URLs come from the command line when given, otherwise from
    fediverse.org. If ./pageTemplate.html exists, an HTML report is also
    rendered to ./scannedGroups.html.
    """
    scandate = datetime.datetime.today().isoformat()
    nodelist = []
    if len(sys.argv) > 1:
        # Get nodelist from command line.
        # Fix: skip argv[0] -- the original iterated over all of sys.argv
        # and tried to scan the script's own path as a "node".
        nodelist = list(sys.argv[1:])
    else:
        # Get the latest node list from fediverse.org
        try:
            listreq = urllib.request.urlopen("https://fediverse.org/")
        except urllib.error.HTTPError as e:
            print("Unable to get node list.", e)
            return
        # Now try to get a DOM from the received HTML
        try:
            nlistHTML = listreq.read().decode('utf-8')
            nldom = etree.HTML(nlistHTML)
        except AttributeError as e:
            print("Problem parsing node list. Aborting.", e)
            return
        for node in nldom.xpath(".//a[@class='node_url']"):
            nodelist.append(node.get("href"))
    # Scan for groups...
    groups = getGroups(nodelist)
    print(str(len(groups)) + " total groups indexed.")
    # Save the groups database to "groups.json" (with-block closes the file,
    # which the original never did).
    with open("./groups.json", mode="w", encoding="utf-8") as jsondump:
        json.dump(groups, jsondump)
    # If "pageTemplate.html" exists, generate an HTML table with results and
    # save it to "scannedGroups.html". If it doesn't, we're done.
    if not os.path.exists("./pageTemplate.html"):
        return
    # Generate an HTML table with scan results.
    # NOTE(review): the original template literals were corrupted in this
    # copy of the file (the markup was stripped). They are reconstructed
    # here to match the header columns and the !TOKEN replace() calls below.
    html = ("<table>\n"
            "<tr><th>Node</th><th>Nickname</th><th>Name</th>"
            "<th>Description</th><th>User Count</th>"
            "<th>Creation Date</th></tr>\n")
    row = ('<tr><td><a href="!NODEURL">!NODE</a></td>'
           '<td><a href="!URL">!NICK</a></td>'
           '<td>!NAME</td><td>!DESC</td>'
           '<td>!USERS</td><td>!DATE</td></tr>\n')
    for group in groups:
        # Replace !NODEURL before !NODE so the longer token is not
        # clobbered by its prefix.
        newrow = row.replace("!NODEURL", group["server"])
        newrow = newrow.replace("!NODE", group["server"].split("/")[2])
        newrow = newrow.replace("!URL", group["url"])
        newrow = newrow.replace("!NICK", group["nick"])
        newrow = newrow.replace("!NAME", group["name"])
        newrow = newrow.replace("!DESC", group["desc"])
        newrow = newrow.replace("!DATE", group["cdate"])
        newrow = newrow.replace("!USERS", group["users"])
        html = html + newrow
    html = html + "</table>\n"
    scandate = scandate + " -- " + str(len(groups)) + " groups indexed"
    # Insert the results into a template HTML file
    with open("./pageTemplate.html", mode="r", encoding="utf-8") as tmpl:
        pagetemplate = tmpl.read()
    pagetemplate = pagetemplate.replace("!TABLE", html)
    pagetemplate = pagetemplate.replace("!REPORT", "Last scan: " + scandate)
    # Then save the new group index HTML.
    with open("./scannedGroups.html", mode="w", encoding="utf-8") as groupfile:
        groupfile.write(pagetemplate)
def getGroups(nodelist):
    """Scrape every node in *nodelist* and return the combined group list.

    Each element of the returned list is a dict produced by gsGroup().
    Progress is printed to stdout as each node is scanned.
    """
    groups = []
    # Fix: corrected the typo "retrieived" in the status message.
    print("Node list retrieved from fediverse.org.\nStarting scan:\n")
    for node in nodelist:
        print("Trying " + node + "...")
        newgroups = scrape(node)
        print(" " + str(len(newgroups)) + " groups added.\n")
        # extend() accumulates in place instead of rebinding a new list
        # on every iteration.
        groups.extend(newgroups)
    return groups
# --------------------------------------------------
def gsGroup(server, nick, name, url, desc, cdate, users):
    """Bundle one scraped group's fields into a plain dict record."""
    fields = ("server", "nick", "name", "url", "desc", "cdate", "users")
    values = (server, nick, name, url, desc, cdate, users)
    return dict(zip(fields, values))
def scrape(node):
    """Scrape the public groups index of one StatusNet/GNU social node.

    Pages through the node's groups listing, preferring the verbose
    /groups browser and falling back to the plain /group index, until an
    error page (end of index) is reached. Returns a list of group dicts
    (see gsGroup); on failure, returns whatever was collected so far.
    """
    groups = []
    groupsPage = 1
    morePages = True
    # True while we are still trying the verbose /groups index.
    groupsBrowser = True
    print("Requesting groups from " + node + " --:\n")
    while morePages:
        print("Trying page " + str(groupsPage))
        # First, try to get the /groups? index, because it's more verbose.
        # BUG FIX: the original concatenated the literal string "node"
        # instead of the node URL, and then never used groupsPageUrl at
        # all -- it always requested /groups?, so the /group fallback was
        # never actually fetched. Both problems are fixed here.
        if groupsBrowser:
            groupsPageUrl = node + "/groups?page=" + str(groupsPage)
        else:
            groupsPageUrl = node + "/group?page=" + str(groupsPage)
        # (The original built an ssl.SSLContext here, but requests.get()
        # never received it, so that dead code has been removed.)
        # Try to GET it...
        try:
            req = requests.get(groupsPageUrl, timeout=20)
        except Exception:
            if groupsBrowser:
                print(" Verbose index unavailable...")
                groupsBrowser = False
                continue
            else:
                print(" Vanilla index unavailable... giving up. :( ")
                return groups
        # Try to parse it...
        try:
            ixdom = etree.HTML(req.text)
        except Exception:
            if groupsBrowser:
                print(" Verbose index unparsable...")
                groupsBrowser = False
                continue
            else:
                print(" Vanilla index unparsable... giving up. :(")
                return groups
        # Is this an error page? This usually means we've gotten to the
        # end of the groups index.
        if len(ixdom.cssselect(".error")) > 0:
            print(" " + str(groupsPage) + " pages scanned.")
            morePages = False
            break
        # This bit naively assumes that the cssselect will return
        # its results in document order.
        if node.find("/rainbowdash.net") > 0:
            # Special-case Rainbow Dash: their groups pages are laid out
            # differently from other StatusNet/GNU social sites.
            if len(ixdom.cssselect("a[rel='next']")) < 1:
                morePages = False
            # Get all the containers for the group entries
            groupEntries = ixdom.cssselect("#content_inner li.hentry")
            for e in groupEntries:
                # Rainbow Dash includes off-site groups (e.g. Facebook)
                # whose avatar links to http://rainbowdash.net/group.
                # Skip over these.
                groupUrl = e.cssselect("a.entry-title")[0].get("href")
                if groupUrl == "http://rainbowdash.net/group":
                    continue
                groupNick = e.cssselect("span.nickname")[0].text
                fn = e.cssselect("span.fn")
                groupName = fn[0].text if fn else ""
                note = e.cssselect("p.note")
                groupDesc = note[0].text if note else ""
                # No creation date / user count on these pages.
                groups.append(gsGroup(node, groupNick, groupName,
                                      groupUrl, groupDesc, "", ""))
        else:
            if node.find("/social.ilikefreedom.ro") > 0:
                # This node serves the same eight groups on every page,
                # so scan only the first page to avoid an endless loop.
                morePages = False
            groupNicks = ixdom.cssselect(".p-nickname")
            if len(groupNicks) < 1:
                print(" " + str(groupsPage) + " pages scanned.")
                morePages = False
                break
            groupCdates = ixdom.cssselect(".entry_created")
            groupCounts = ixdom.cssselect(".entry_member_count")
            # When a node omits these columns, pad with dict placeholders;
            # the .text attribute access below then raises and falls back
            # to "n/a" (preserving the original's behavior).
            if len(groupCdates) == 0:
                groupCdates = [{"text": ""} for _ in range(len(groupNicks))]
            if len(groupCounts) == 0:
                groupCounts = [{"text": ""} for _ in range(len(groupNicks))]
            groupNames = []
            groupDescs = []
            for nick in groupNicks:
                descContainer = nick.getparent()
                pnames = descContainer.cssselect(".p-name")
                groupNames.append(pnames[0].text if pnames else "")
                notes = descContainer.cssselect(".note")
                groupDescs.append(notes[0].text if notes else "")
            for ix in range(len(groupNicks)):
                # Skip groups hosted on a different server (remote groups
                # federated into this node's index).
                groupserver = groupNicks[ix].get("href").split("/")[2]
                nodeserver = node.split("/")[2]
                if groupserver != nodeserver:
                    continue
                try:
                    cdate = groupCdates[ix].text
                except Exception:
                    cdate = "n/a"
                try:
                    users = groupCounts[ix].text
                except Exception:
                    users = "n/a"
                groups.append(gsGroup(node,
                                      groupNicks[ix].text,
                                      groupNames[ix],
                                      groupNicks[ix].get("href"),
                                      groupDescs[ix],
                                      cdate,
                                      users))
        groupsPage = groupsPage + 1
    return groups
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()