#!/usr/bin/env python3
# GNU social groups scanner.
#
# This is an ugly screen-scraping methodology for automatically
# collecting a database of federated groups. Unfortunately, the /api/statusnet/groups/list_all endpoint on many StatusNet
# and GNU social sites seems to require authentication to access,
# while the Web UI's groups list is public.
#
# Pass me a list of server names on the command line if you want to manually specify the list, like:
#
# $ scanner.py http://node1.url https://node2.url ...
#
# Or, if no nodes are specified, I'll try to grab a list from fediverse.org.
# This program is free software. Use, copy, modify to your heart's content.
import sys, os, subprocess, urllib.request, re, json, datetime
import html5lib
import requests
import ssl
import lxml
from lxml import etree
def main():
    """Collect a federated-groups database and save it to groups.json.

    Node URLs come from the command line when given, otherwise from
    fediverse.org. If ./pageTemplate.html exists, an HTML report is also
    rendered to ./scannedGroups.html.
    """
    scandate = datetime.datetime.today().isoformat()
    nodelist = []
    if len(sys.argv) > 1:
        # Get nodelist from command line.
        # Fix: skip argv[0] -- the original iterated over all of sys.argv
        # and tried to scan the script's own path as a "node".
        nodelist = list(sys.argv[1:])
    else:
        # Get the latest node list from fediverse.org
        try:
            listreq = urllib.request.urlopen("https://fediverse.org/")
        except urllib.error.HTTPError as e:
            print("Unable to get node list.", e)
            return
        # Now try to get a DOM from the received HTML
        try:
            nlistHTML = listreq.read().decode('utf-8')
            nldom = etree.HTML(nlistHTML)
        except AttributeError as e:
            print("Problem parsing node list. Aborting.", e)
            return
        for node in nldom.xpath(".//a[@class='node_url']"):
            nodelist.append(node.get("href"))
    # Scan for groups...
    groups = getGroups(nodelist)
    print(str(len(groups)) + " total groups indexed.")
    # Save the groups database to "groups.json" (with-block closes the file,
    # which the original never did).
    with open("./groups.json", mode="w", encoding="utf-8") as jsondump:
        json.dump(groups, jsondump)
    # If "pageTemplate.html" exists, generate an HTML table with results and
    # save it to "scannedGroups.html". If it doesn't, we're done.
    if not os.path.exists("./pageTemplate.html"):
        return
    # Generate an HTML table with scan results.
    # NOTE(review): the original template literals were corrupted in this
    # copy of the file (the markup was stripped). They are reconstructed
    # here to match the header columns and the !TOKEN replace() calls below.
    html = ("<table>\n"
            "<tr><th>Node</th><th>Nickname</th><th>Name</th>"
            "<th>Description</th><th>User Count</th>"
            "<th>Creation Date</th></tr>\n")
    row = ('<tr><td><a href="!NODEURL">!NODE</a></td>'
           '<td><a href="!URL">!NICK</a></td>'
           '<td>!NAME</td><td>!DESC</td>'
           '<td>!USERS</td><td>!DATE</td></tr>\n')
    for group in groups:
        # Replace !NODEURL before !NODE so the longer token is not
        # clobbered by its prefix.
        newrow = row.replace("!NODEURL", group["server"])
        newrow = newrow.replace("!NODE", group["server"].split("/")[2])
        newrow = newrow.replace("!URL", group["url"])
        newrow = newrow.replace("!NICK", group["nick"])
        newrow = newrow.replace("!NAME", group["name"])
        newrow = newrow.replace("!DESC", group["desc"])
        newrow = newrow.replace("!DATE", group["cdate"])
        newrow = newrow.replace("!USERS", group["users"])
        html = html + newrow
    html = html + "</table>\n"
    scandate = scandate + " -- " + str(len(groups)) + " groups indexed"
    # Insert the results into a template HTML file
    with open("./pageTemplate.html", mode="r", encoding="utf-8") as tmpl:
        pagetemplate = tmpl.read()
    pagetemplate = pagetemplate.replace("!TABLE", html)
    pagetemplate = pagetemplate.replace("!REPORT", "Last scan: " + scandate)
    # Then save the new group index HTML.
    with open("./scannedGroups.html", mode="w", encoding="utf-8") as groupfile:
        groupfile.write(pagetemplate)
def getGroups(nodelist):
    """Scrape every node in *nodelist* and return the combined group list.

    Each element of the returned list is a dict produced by gsGroup().
    Progress is printed to stdout as each node is scanned.
    """
    groups = []
    # Fix: corrected the typo "retrieived" in the status message.
    print("Node list retrieved from fediverse.org.\nStarting scan:\n")
    for node in nodelist:
        print("Trying " + node + "...")
        newgroups = scrape(node)
        print(" " + str(len(newgroups)) + " groups added.\n")
        # extend() accumulates in place instead of rebinding a new list
        # on every iteration.
        groups.extend(newgroups)
    return groups
# --------------------------------------------------
def gsGroup(server, nick, name, url, desc, cdate, users):
    """Bundle one scraped group's fields into a plain dict record."""
    fields = ("server", "nick", "name", "url", "desc", "cdate", "users")
    values = (server, nick, name, url, desc, cdate, users)
    return dict(zip(fields, values))
def scrape(node):
    """Scrape the public groups index of one StatusNet/GNU social node.

    Pages through the node's groups listing, preferring the verbose
    /groups browser and falling back to the plain /group index, until an
    error page (end of index) is reached. Returns a list of group dicts
    (see gsGroup); on failure, returns whatever was collected so far.
    """
    groups = []
    groupsPage = 1
    morePages = True
    # True while we are still trying the verbose /groups index.
    groupsBrowser = True
    print("Requesting groups from " + node + " --:\n")
    while morePages:
        print("Trying page " + str(groupsPage))
        # First, try to get the /groups? index, because it's more verbose.
        # BUG FIX: the original concatenated the literal string "node"
        # instead of the node URL, and then never used groupsPageUrl at
        # all -- it always requested /groups?, so the /group fallback was
        # never actually fetched. Both problems are fixed here.
        if groupsBrowser:
            groupsPageUrl = node + "/groups?page=" + str(groupsPage)
        else:
            groupsPageUrl = node + "/group?page=" + str(groupsPage)
        # (The original built an ssl.SSLContext here, but requests.get()
        # never received it, so that dead code has been removed.)
        # Try to GET it...
        try:
            req = requests.get(groupsPageUrl, timeout=20)
        except Exception:
            if groupsBrowser:
                print(" Verbose index unavailable...")
                groupsBrowser = False
                continue
            else:
                print(" Vanilla index unavailable... giving up. :( ")
                return groups
        # Try to parse it...
        try:
            ixdom = etree.HTML(req.text)
        except Exception:
            if groupsBrowser:
                print(" Verbose index unparsable...")
                groupsBrowser = False
                continue
            else:
                print(" Vanilla index unparsable... giving up. :(")
                return groups
        # Is this an error page? This usually means we've gotten to the
        # end of the groups index.
        if len(ixdom.cssselect(".error")) > 0:
            print(" " + str(groupsPage) + " pages scanned.")
            morePages = False
            break
        # This bit naively assumes that the cssselect will return
        # its results in document order.
        if node.find("/rainbowdash.net") > 0:
            # Special-case Rainbow Dash: their groups pages are laid out
            # differently from other StatusNet/GNU social sites.
            if len(ixdom.cssselect("a[rel='next']")) < 1:
                morePages = False
            # Get all the containers for the group entries
            groupEntries = ixdom.cssselect("#content_inner li.hentry")
            for e in groupEntries:
                # Rainbow Dash includes off-site groups (e.g. Facebook)
                # whose avatar links to http://rainbowdash.net/group.
                # Skip over these.
                groupUrl = e.cssselect("a.entry-title")[0].get("href")
                if groupUrl == "http://rainbowdash.net/group":
                    continue
                groupNick = e.cssselect("span.nickname")[0].text
                fn = e.cssselect("span.fn")
                groupName = fn[0].text if fn else ""
                note = e.cssselect("p.note")
                groupDesc = note[0].text if note else ""
                # No creation date / user count on these pages.
                groups.append(gsGroup(node, groupNick, groupName,
                                      groupUrl, groupDesc, "", ""))
        else:
            if node.find("/social.ilikefreedom.ro") > 0:
                # This node serves the same eight groups on every page,
                # so scan only the first page to avoid an endless loop.
                morePages = False
            groupNicks = ixdom.cssselect(".p-nickname")
            if len(groupNicks) < 1:
                print(" " + str(groupsPage) + " pages scanned.")
                morePages = False
                break
            groupCdates = ixdom.cssselect(".entry_created")
            groupCounts = ixdom.cssselect(".entry_member_count")
            # When a node omits these columns, pad with dict placeholders;
            # the .text attribute access below then raises and falls back
            # to "n/a" (preserving the original's behavior).
            if len(groupCdates) == 0:
                groupCdates = [{"text": ""} for _ in range(len(groupNicks))]
            if len(groupCounts) == 0:
                groupCounts = [{"text": ""} for _ in range(len(groupNicks))]
            groupNames = []
            groupDescs = []
            for nick in groupNicks:
                descContainer = nick.getparent()
                pnames = descContainer.cssselect(".p-name")
                groupNames.append(pnames[0].text if pnames else "")
                notes = descContainer.cssselect(".note")
                groupDescs.append(notes[0].text if notes else "")
            for ix in range(len(groupNicks)):
                # Skip groups hosted on a different server (remote groups
                # federated into this node's index).
                groupserver = groupNicks[ix].get("href").split("/")[2]
                nodeserver = node.split("/")[2]
                if groupserver != nodeserver:
                    continue
                try:
                    cdate = groupCdates[ix].text
                except Exception:
                    cdate = "n/a"
                try:
                    users = groupCounts[ix].text
                except Exception:
                    users = "n/a"
                groups.append(gsGroup(node,
                                      groupNicks[ix].text,
                                      groupNames[ix],
                                      groupNicks[ix].get("href"),
                                      groupDescs[ix],
                                      cdate,
                                      users))
        groupsPage = groupsPage + 1
    return groups
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()