
I want to convert my Python parser's output to JSON. Here is the code of my parser.

import re
from urllib.parse import quote, unquote
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup

url = "https://fasie.ru"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
div = soup.find_all('div', class_='wrap')
programms_list = div[1].find('ul', class_='').find_all('ul', class_='')[1]
hrefs = programms_list.find_all('a')
download_links = set()
response = requests.get(url + '/programs')
parse_text = BeautifulSoup(response.text, 'html.parser')
links = {x.get('href') for x in parse_text.find_all(href=re.compile('^/programs/'))}

def main():
    # Print the text of every <section> on each program page
    for h in hrefs:
        url_h = f"https://fasie.ru{h.get('href')}"
        page_h = urlopen(url_h)
        html_h = page_h.read().decode("utf-8")
        soup_h = BeautifulSoup(html_h, "html.parser")
        for s in soup_h.find_all('section'):
            print(s.text)
    # Collect every document link from the program pages
    for link in links:
        response = requests.get(url + link)
        parse_text = BeautifulSoup(response.text, 'html.parser')
        download_links.update(x.get('href') for x in parse_text.find_all(href=re.compile('^/upload/docs')))
    # Download each collected file once
    for link in download_links:
        file_name = unquote(link).split('/')[-1]
        response = requests.get(url + quote(link))
        with open(file_name, 'wb') as f:
            f.write(response.content)

main()

And here's what the JSON should look like:

[
  {
    "source": "Link the information was taken from", // in this case, the fasie.ru link
    "name": "ИнноШкольник",
    "description": "Information from the `О программе` (About the program) tab",
    "program": "Data from the `Конкурсы, программы` (Contests, programs) tab, as HTML",
    "contacts": [
      {
        "name": "Contact name",
        "tel": "Phone number",
        "email": "Contact email"
      }
    ],
    "documents": [
      {
        "source": "Original link of the file, i.e. where it was downloaded from",
        "path": "Relative path to the (already downloaded) file",
        "name": "File name",
        "extension": "File extension (e.g. pdf)",
        "size": 123 // Size in bytes
      }
    ]
  }
]

The goal is to create a bot that automatically outputs the information gathered by the Python code as JSON.
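
In Python terms, the target is just a list of dicts that can be passed to json.dump. A minimal sketch with placeholder values (the URL and field contents below are illustrative, not real scraped data):

import json

program = {
    "source": "https://fasie.ru/programs/example",  # placeholder URL
    "name": "ИнноШкольник",
    "description": "Text of the About the program tab ...",
    "program": "<p>HTML of the contests tab ...</p>",
    "contacts": [{"name": "Имя", "tel": "+7 ...", "email": "info@example.org"}],
    "documents": [],
}

# ensure_ascii=False keeps Cyrillic text readable instead of \uXXXX escapes
with open("output.json", "w", encoding="utf-8") as f:
    json.dump([program], f, indent=2, ensure_ascii=False)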

2 Answers


  1. Chosen as BEST ANSWER

Found it. Thanks to user510170.

    import json
    import re
    from urllib.parse import quote, unquote
    from urllib.request import urlopen
    import requests
    from bs4 import BeautifulSoup

    url = "https://fasie.ru"
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find_all('div', class_='wrap')
    programms_list = div[1].find('ul', class_='').find_all('ul', class_='')[1]
    hrefs = programms_list.find_all('a')
    response = requests.get(url + '/programs')
    parse_text = BeautifulSoup(response.text, 'html.parser')
    links = {x.get('href') for x in parse_text.find_all(href=re.compile('^/programs/'))}
    programs = []

    def clean(text):
        # Collapse newlines, tabs and non-breaking spaces into single spaces
        return ' '.join(text.split())
    
    def main():
        for h in hrefs:
            program = {}
            url_h = f"https://fasie.ru{h.get('href')}"
            page_h = urlopen(url_h)
            html_h = page_h.read().decode("utf-8")
            soup_h = BeautifulSoup(html_h, "html.parser")
            soup_b = BeautifulSoup(html_h, 'lxml')  # second parse with lxml for the contacts markup

            # content-tab3 holds the description tab, content-tab1 the programme text
            description = soup_h.find('section', {'id': 'content-tab3'})
            program['source'] = url_h
            program['name'] = h.text.strip()
            program['description'] = clean(description.text) if description else ''
            program['program'] = clean(soup_h.find('section', {'id': 'content-tab1'}).get_text())

            # Contacts are table rows in content-tab5 or, on some pages, paragraphs in content-tab4
            try:
                notag = soup_b.find('div', class_='tabs').find('section', id='content-tab5').find_all('tr')
            except AttributeError:
                notag = soup_b.find('div', class_='tabs').find('section', id='content-tab4').find_all('p')

            contacts = []
            for n in notag:
                nams, tels, emails = [], [], []
                try:
                    nam = [clean(i.text) for i in n.find_all('h4')]
                    if not nam:
                        nam = [clean(i.text) for i in n.find_all('b')]
                    nams = [i for i in nam if i.strip()]
                    email = ''.join(n.find('a').text.split())
                    if email:
                        emails.append(email)
                    tels = [t for t in (clean(i.text) for i in n.find_all('nobr')) if t]
                except AttributeError:
                    pass
                if nams or tels or emails:
                    contacts.append({'name': nams, 'tel': tels, 'email': emails})
            program['contacts'] = contacts

            # Collect this program's document links, download each file, and record its metadata
            download_links = set()
            documents = []
            for s in soup_h.find_all('section'):
                download_links.update(x.get('href') for x in s.find_all(href=re.compile('^/upload/docs')))
            for link in download_links:
                file_name = unquote(link).split('/')[-1]
                response = requests.get(url + quote(link))
                with open(file_name, 'wb') as f:
                    f.write(response.content)
                documents.append({
                    'source': url + link,
                    'path': file_name,
                    'name': file_name,
                    'extension': file_name.split('.')[-1],
                    'size': len(response.content),
                })
            program['documents'] = documents
            programs.append(program)

        with open('output.json', 'w', encoding='utf-8') as f:
            json.dump(programs, f, indent=2, ensure_ascii=False)

    main()
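
    The key detail for the output file is ensure_ascii=False together with encoding='utf-8': without them, json escapes every Cyrillic character as a \uXXXX sequence and the file becomes unreadable by eye.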
    

  2. I don't know if I understood you correctly, but to "JSON stringify" an object you can just build the object in question, import the json module, and use its dump (or dumps) method.
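
    For example, a minimal sketch where data stands in for whatever object you have built:

    import json

    data = {"name": "ИнноШкольник", "documents": []}  # placeholder object

    # "Stringify" to a str
    s = json.dumps(data, ensure_ascii=False, indent=2)

    # Or write straight to a file
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)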
