2021-03-11 19:33:17

Hello all,
For a bit of a break from my usual projects, I thought I'd get back into web scraping. As an exercise to see how much data this forum generates, I wrote a script to download it, almost in its entirety, and save it in an SQLite database.

I converted the text of posts to markdown, but other than that I left everything alone.

I'm posting this here so anyone who's interested in web scraping can see how it's done, not because I condone idiots storing everything they ever come across. I'm also not sure of the legality of holding that kind of data (although this script only downloads usernames), so I'll personally be deleting the database once it's done.

Honestly, the only thing I'm interested in is who's posted the most! smile

All of that said, here's the code for anyone who's interested:

"""The audiogames.net forum downloader."""

from datetime import datetime
from typing import Iterator, List, Optional, Union

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from html2markdown import convert
from requests import Response
from requests import Session as RequestsSession
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, create_engine
from sqlalchemy.engine.base import Engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Query, relationship, sessionmaker
from sqlalchemy.orm.relationships import RelationshipProperty

FindType = Union[Tag, NavigableString]
engine: Engine = create_engine("sqlite:///db.sqlite3")


class _BaseClass:
    """Add an ID."""

    __tablename__: str
    id = Column(Integer, primary_key=True)

    def save(self) -> None:
        """Save this instance."""
        session.add(self)
        session.commit()

    @classmethod
    def query(cls, *args, **kwargs) -> Query:
        """Return a query linked to this class."""
        return session.query(cls).filter(*args).filter_by(**kwargs)

    @classmethod
    def count(cls, *args, **kwargs) -> int:
        """Return the number of rows that match the given criteria."""
        return cls.query(*args, **kwargs).count()


Base = declarative_base(bind=engine, cls=_BaseClass)


Session = sessionmaker(bind=engine)
session = Session()


class NameMixin:
    """Add a name parameter."""

    id: int

    name = Column(String(1024), nullable=False)

    def __str__(self) -> str:
        """Return a string representation of this object."""
        return f"{self.name} (#{self.id})"


class User(Base, NameMixin):  # type:ignore[valid-type, misc]
    """A forum user."""

    __tablename__ = "users"
    name = Column(String(50), nullable=False)
    registered = Column(DateTime(timezone=True), nullable=True)


class Room(Base, NameMixin):  # type:ignore[valid-type, misc]
    """A room in the forum."""

    __tablename__ = "rooms"


class Thread(Base, NameMixin):  # type:ignore[valid-type, misc]
    """A forum thread."""

    __tablename__ = "threads"
    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
    user: RelationshipProperty = relationship("User", backref="threads")
    room_id = Column(Integer, ForeignKey("rooms.id"), nullable=False)
    room: RelationshipProperty = relationship("Room", backref="threads")


class Post(Base):  # type:ignore[valid-type, misc]
    """A forum post."""

    __tablename__ = "posts"
    posted = Column(DateTime(timezone=True), nullable=False)
    text = Column(String(65535), nullable=True)
    url = Column(String(1024), nullable=False)
    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
    user: RelationshipProperty = relationship("User", backref="posts")
    thread_id = Column(Integer, ForeignKey("threads.id"), nullable=False)
    thread: RelationshipProperty = relationship("Thread", backref="posts")


Base.metadata.create_all()
url: str = "https://forum.audiogames.net/"
http: RequestsSession = RequestsSession()


def main() -> None:
    """Start scraping."""
    response: Response = http.get(url)
    soup: BeautifulSoup = BeautifulSoup(response.text, "lxml")
    h3: Tag
    tags: Iterator[FindType] = soup.find_all("h3")
    for h3 in tags:
        assert isinstance(h3, Tag)
        parse_room(h3)


def parse_room(h3: Tag) -> None:
    """Parse a room from a link.

    :param h3: The level 3 heading containing the link from the main forum.
    """
    a: Optional[FindType] = h3.find("a")
    if a is None or isinstance(a, NavigableString):
        raise RuntimeError("Invalid room link:\n%s" % h3)
    href: str = a["href"]
    name: str = a.text
    room: Optional[Room] = Room.query(name=name).first()
    if room is None:
        room = Room(name=a.text)
        room.save()
        print(f"Created room {room.name}.")
    else:
        print(f"Using existing room {room}.")
    response = http.get(href)
    soup = BeautifulSoup(response.text, "lxml")
    p: Optional[FindType] = soup.find("p", attrs={"class": "paging"})
    if p is None or isinstance(p, NavigableString):
        return print("Cannot find page links for this room.")
    links: List[FindType] = p.find_all("a")
    a = links[-2]
    assert isinstance(a, Tag)
    parse_pages(room, a)


def parse_pages(room: Room, a: Tag) -> None:
    """Parse pages of threads for a particular room.

    :param room: The room to work in.

    :param a: The link to the page with the highest number.
    """
    href = a["href"][:-1]
    href = href[: href.rindex("/") + 1] + "%d"
    page: int = int(a.text)
    while page > 0:
        print(f"Parsing page {page}.")
        response = http.get(href % page)
        soup = BeautifulSoup(response.text, "lxml")
        tags = soup.find_all("h3")
        for h3 in tags:
            assert isinstance(h3, Tag)
            parse_thread(room, h3)
        room.save()
        page -= 1


def parse_thread(room: Room, h3: Tag) -> None:
    """Parse a particular thread in the given room.

    :param room: The room to work in.

    :param h3: The level 3 heading containing the link to the thread to parse.
    """
    a = h3.find("a")
    assert isinstance(a, Tag)
    name = a.text
    href = a["href"]
    thread: Optional[Thread] = Thread.query(name=name, room=room).first()
    if thread is None:
        thread = Thread(name=name, room=room)
    response = http.get(href)
    soup = BeautifulSoup(response.text, "lxml")
    tags = soup.find_all("div", attrs={"class": "post"})
    div: Tag
    for div in tags:
        assert isinstance(div, Tag)
        parse_message(thread, div)


def parse_message(thread: Thread, div: Tag) -> None:
    """Parse the given message.

    :param thread: The thread this message will belong to.

    :param div: The div element containing the message to parse.
    """
    # Pull the numeric post ID out of a link of the form <base>/post/<id>/...
    href = div.find_all("a")[0]["href"]
    post_id: str = href[len(url) :]
    post_id = post_id[len("post/") :].split("/")[0]
    if Post.count(id=post_id) > 0:
        return print(f"Skipping message #{post_id}.")
    span: Optional[FindType] = div.find("span", attrs={"class": "post-byline"})
    assert isinstance(span, Tag)
    username: str = span.find("strong").text
    user: Optional[User] = User.query(name=username).first()
    if user is None:
        print(f"Creating user {username}.")
        ul: Optional[FindType] = div.find("ul", attrs={"class": "author-info"})
        assert isinstance(ul, Tag)
        li: Optional[FindType] = ul.find(
            lambda t: t.name == "span" and t.text.startswith("Registered:")
        )
        registered: Optional[datetime] = None
        if li is not None:
            registered = datetime.fromisoformat(li.find("strong").text)
        user = User(name=username, registered=registered)
        user.save()
    else:
        print(f"Using existing user {user}.")
    if "firstpost" in div["class"]:
        print(f"{username} is thread starter.")
        thread.user = user
        thread.save()
    content: Optional[FindType] = div.find("div", attrs={"class": "entry-content"})
    assert isinstance(content, Tag)
    signature: Optional[FindType] = content.find("div")
    span = div.find("span", attrs={"class": "post-link"})
    assert isinstance(span, Tag)
    posted: datetime = datetime.fromisoformat(span.text)
    strings: List[str] = []
    child: FindType
    for child in content:
        if isinstance(child, NavigableString):
            continue
        if child is not signature:
            strings.append(convert(str(child)))
    post: Post = Post(
        id=int(post_id),
        posted=posted,
        text="\n\n".join(strings),
        user=user,
        thread=thread,
        url=href,
    )
    print(f"Created post #{post_id}.")
    post.save()


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Aborted.")
    finally:
        print(f"Users: {User.count()}")
        print(f"Threads: {Thread.count()}")
        print(f"Posts: {Post.count()}.")
        session.close()
-----
I have code on GitHub

2021-03-11 19:55:21

It's legal as long as you don't authenticate first: this is why the Wayback Machine can exist.

I don't remember what they are but there are already tools for this that will automatically follow every link and scrape to a folder of your choice.  I think even quite a few CLI ones?  For some reason I want to say aria2c can do it.

Be aware that Cloudflare won't like this behavior: your IP will, in all likelihood, eventually get blocked.

My Blog
Twitter: @ajhicks1992

2021-03-11 20:39:56

@2
Nice, I thought aria2c was a torrent client. Honestly, I was more bothered about the practice than the results.

-----
I have code on GitHub

2021-03-12 09:59:02

HTTrack can do it, but it only outputs plain HTML.

If you want to contact me, do not use the forum PM. I respond once a year or two, when I need to write a PM myself. I apologize for the inconvenience.
Telegram: Nuno69a
E-Mail: nuno69a (at) gmail (dot) com

2021-03-12 12:56:49 (edited by ashleygrobler04 2021-03-12 12:58:20)

Mine just says "do you need to install a parser library?"
Edit: had to install lxml and it works...

best regards
never give up on what ever you are doing.

2021-03-12 16:16:25

moderation:
I recognize that this is legal, so have at it. However, in an effort to minimize undue strain on the server, I'd like to ask that we try and keep full downloads to a minimum. This forum has over 35,000 topics, adding up to about 577,000 posts and, by my calculations, at least 4 GB of generated HTML, more depending on fetched resources.
end moderation

That having been said, I actually wrote an equally sophisticated, though messier, script for personal use that attempts to gather everything possible into a PunBB-ish database, in the event that an asteroid tanks into our server, as the best I can personally do for an off-site backup solution.

I normally do it with wget's --mirror option and an insane number of exclusions. The benefit is that you get a semi-functional and quicker copy of the forum running offline for searchless consumption anywhere; the problem is that you actually have to find those exclusions, which means pausing the download whenever it grabs a directory or script you don't want, hopefully before hours are spent saving what'll have to be deleted later. Once you've done this, though, it's smooth sailing. Here, /post/ is a prime example: by default, I believe copies of threads would be made for literally every single post.
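
Roughly, the invocation looks something like the sketch below. The exclusion list is only illustrative, and I'm expressing it as a Python subprocess call to match the script in post 1 rather than pasting my exact command line:

from subprocess import run

# A rough sketch of a wget mirror with exclusions; the /misc entry is a
# made-up example, and the real exclusion list grows as you watch the mirror.
command = [
    "wget",
    "--mirror",              # recursive download with timestamping
    "--convert-links",       # rewrite links so the copy works offline
    "--adjust-extension",    # save pages with .html extensions
    "--page-requisites",     # grab the CSS and images needed to render pages
    "--wait=1",              # pause between requests to be polite
    "--exclude-directories=/post,/misc",  # skip per-post thread copies, etc.
    "https://forum.audiogames.net/",
]
run(command, check=True)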

As an exercise, it could be interesting to run some form of statistical analysis on the generated database and publish the findings, given that AgNet represents a microcosm of the larger blind community.

2021-03-12 16:36:34

@5
Yeah, sorry, I only just posted the requirements file.

@6
Thanks for the moderation. I put in sleeps, so the load shouldn't be too bad, and most of the processing is done locally by the script, although I'm sure even the most idiotic programmer could remove those lines.
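
In case it helps anyone adapting the script, the throttling is nothing more sophisticated than a pause between page fetches; something along these lines (the one-second delay and the helper name are illustrative, not necessarily what's on GitHub):

from time import sleep

from requests import Response, Session

http: Session = Session()
REQUEST_DELAY: float = 1.0  # seconds between fetches; purely illustrative


def polite_get(target: str) -> Response:
    """Fetch a URL, then pause so the server isn't hammered."""
    response: Response = http.get(target)
    sleep(REQUEST_DELAY)
    return response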

Anyways, if you want to use my script (which is now a lot better) to make offsite backups, it now skips over posts it's already dealt with, so you could run it weekly to get the most relevant stuff.

It's on my GitHub.

Just as an FYI, Dark has the most posts, with 24827.
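
If you want to reproduce that figure against your own copy of the database, a query along these lines should do it. It's written against the models from the script in post 1, and I'm not claiming it's the exact query I ran:

from sqlalchemy import func

# Count posts per user and take the largest; assumes the User and Post models
# and the session from the script in post 1.
top = (
    session.query(User.name, func.count(Post.id))
    .join(Post, Post.user_id == User.id)
    .group_by(User.id)
    .order_by(func.count(Post.id).desc())
    .first()
)
if top is not None:
    name, total = top
    print(f"{name} has {total} posts.")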

BTW, if you do decide to use my script for offsite backups, let me know, and I can modify it to include entries from the games database.

It currently doesn't do that, and I've no real interest in scraping the data, now that I've proved to myself that I can if I want to.

-----
I have code on GitHub

2021-03-12 17:28:48

Someone should scrape it though, seeing as the forum is going to fall down forever one of these days.

My Blog
Twitter: @ajhicks1992

2021-03-12 20:49:11

Hi,
@7, whenever I try to download it from GitHub via the download ZIP option, I just get an internal server error. Does anyone know how to get this file to download?

me and a friend of mine have made a youtube channel where we upload beats that we make. If you want to check it out, you can find it at https://www.youtube.com/channel/UCD0CxF … PFlCqjOtOA