The Algorithms logo
算法
关于我们捐赠

电影推荐句子嵌入

H
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Sentance_Embedding.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "UN3QsCOJj3V4",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import nltk\n",
        "from nltk.stem.lancaster import LancasterStemmer\n",
        "\n",
        "stemmer = LancasterStemmer()\n",
        "\n",
        "# Import package\n",
        "\n",
        "import tensorflow as tf\n",
        "import json\n",
        "import tflearn\n",
        "import numpy as np \n",
        "import random\n",
        "import pickle"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "PKQPFioif9Lw",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://127.0.0.1:8080/",
          "height": 55
        },
        "outputId": "8054baa6-36ad-4ec3-98e9-623703240098"
      },
      "source": [
        "with open('intents.json') as jsonFile:\n",
        "  data = json.load(jsonFile)\n",
        "\n",
        "print(data['intents'])"
      ],
      "execution_count": 136,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[{'tag': 'greeting', 'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day', 'Whats up'], 'responses': ['Hello!', 'Good to see you again!', 'Hi there, how can I help?'], 'context_set': ''}, {'tag': 'action and advanture', 'patterns': [' Elements of action/adventure (car chases, shootouts, explosions) and thriller', 'Combines action set-pieces with serious themes, character insight and/or emotional power', 'I like action movies', 'i like fighting movies', 'action sequences, such as fighting, stunts, car chases or explosions', 'Fighting is really awesome'], 'responses': ['I also like action movies :)', 'ohh nice', 'yeah action movie is awesome'], 'context_set': ''}, {'tag': 'comedy', 'patterns': ['These films are designed to make the audience laugh through amusement', 'very first silent movies were comedies, as slapstick comedy often relies on visual depictions', 'with many former stand-up comics transitioning to the film industry', 'satirical comedy-drama & the plot is', 'i like comedy movies', 'comedy is really love to watch', 'laughing is the best medicine', 'i love everyday'], 'responses': ['I also like comedy movies :)', 'ohh nice', 'yeah laughing is better for health'], 'context_set': ''}, {'tag': 'Horror', 'patterns': ['A horror film is a film that seeks to elicit fear for entertainment purposes', 'horror has existed as a film genre for more than a century', 'i like Horror movie', 'Horror movies are so thrilled', 'an intense feeling of fear, shock, or disgust.', 'I like intense and fear movies', 'Horror films effectively center on the dark side of life', 'i like dark movies', 'Horror Films are unsettling films designed to frighten and panic,monster'], 'responses': ['ohh i think you like science fiction kind of horror', 'Ohhh i also dark movie', 'Hmm horror movies are so thriilled'], 'context_set': ''}, {'tag': 'Romance', 'patterns': ['romance movies are romantic love stories', 'romance movie focus on passion, emotion, and the affectionate romantic involvement', 'Romance movie strong and pure love and romance the main plot focus', 'i love romantic and love movies', 'romantic love, obsessive love, sentimental love, spiritual love, forbidden love/romance, platonic love, sexual and passionate love, sacrificial love, explosive and destructive love', 'i want to watch Deep and true romantic love between two people', 'i like '], 'responses': ['Romantic movie are just awesome', 'Yeahh i like true love movies', 'Romance is everywhere :)', \"I recommond don't watch with family :(\"], 'context_set': ''}, {'tag': 'Accept', 'patterns': ['ohh really', 'nice', 'ohhh'], 'responses': ['yess', 'yeah', 'yepp', \"yeah that's true\"], 'context_set': ''}]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tOnAzP9SgEZJ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "try:\n",
        "  with open('data.pickle',\"rb\") as f:\n",
        "    words, labels, training, output = pickle.load(f)\n",
        "except:\n",
        "  words = []\n",
        "  labels = []\n",
        "  # doc_x contain pattern of words\n",
        "  docs_x = []   \n",
        "  # doc_y contain pattern of specific tag\n",
        "  docs_y = []\n",
        "\n",
        "  for intent in data[\"intents\"]:\n",
        "    for pattern in intent[\"patterns\"]:\n",
        "      # it's consider only root words by removing unnessary stuff from the sentance\n",
        "      # Use Tokenization : that will help to grab the perticular word from the sentance\n",
        "      # it will return the list which contain all the words in it \n",
        "      # nltk.download('punkt')\n",
        "\n",
        "      wrds = nltk.word_tokenize(pattern)\n",
        "      words.extend(wrds)\n",
        "\n",
        "      # append pattern of words\n",
        "      docs_x.append(wrds)\n",
        "      docs_y.append(intent[\"tag\"])\n",
        "\n",
        "      # append tag in labels list\n",
        "    if intent[\"tag\"] not in labels:\n",
        "      labels.append(intent[\"tag\"])\n",
        "\n",
        "  print(labels)\n",
        "  print(docs_x)\n",
        "  print(docs_y)\n",
        "  # convert all the words into lowercase so that uppercase is not different then lowecase word \n",
        "  unvalid_data = ['?', ')', '(', ',', '.', '&']\n",
        "  words = [stemmer.stem(w.lower()) for w in words if w not in unvalid_data]\n",
        "  print(words)\n",
        "  \n",
        "  # remove duplicate and sort\n",
        "  words = sorted(list(set(words)))\n",
        "  print(words)\n",
        "  \n",
        "  # sort labels\n",
        "  labels = sorted(labels)\n",
        "  print(labels)\n",
        "\n",
        "  # we create a bag of words that will represent a any given pattern\n",
        "  # we create 1 hot encoding which will contain the 1 or 0 based on the word exist or not\n",
        "  # in the sentance \n",
        "\n",
        "  # As neural network only understand numeric value rather then a word that's we need to convert them into numeric encoding\n",
        "\n",
        "  # As bag of words represent by the encoding in the form 0 and 1\n",
        "  training = []\n",
        "  output = []\n",
        "\n",
        "  ## if tag is present then it will be 1 or else 0 ( [0,0,0,0,1,0] in are case we have 6 tag )  \n",
        "  out_empty = [0 for _ in range(len(labels))]\n",
        "  print(out_empty)\n",
        "  for x , doc in enumerate(docs_x):\n",
        "    bag = []\n",
        "\n",
        "    wrds= [stemmer.stem(w) for w in doc]\n",
        "    #print(wrds)\n",
        "    for w in words:\n",
        "      if w in wrds:\n",
        "        bag.append(1)\n",
        "      else:\n",
        "        bag.append(0)\n",
        "    # print(bag)\n",
        "\n",
        "    output_row = out_empty[:]\n",
        "    output_row[labels.index(docs_y[x])] = 1 \n",
        "\n",
        "    # get the training and output\n",
        "    training.append(bag)\n",
        "    output.append(output_row)\n",
        "\n",
        "  training = np.array(training)\n",
        "  output = np.array(output)\n",
        "  #print(training)\n",
        "  #print(output)\n",
        "  with open('data.pickle',\"wb\") as f:\n",
        "    pickle.dump( (words, labels, training, output) , f)\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "irLV8R0ui33I",
        "colab_type": "text"
      },
      "source": [
        "**Tensorflow** "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_mHuiaGoR1bd",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# remove warning\n",
        "import warnings\n",
        "warnings.simplefilter('ignore')\n",
        "\n",
        "# work with tensorflow\n",
        "tf.reset_default_graph()\n",
        "\n",
        "# training[0] all list have same len so we can take training[1]\n",
        "net = tflearn.input_data(shape=[None,len(training[0])])\n",
        "\n",
        "# 2 pipes of 8 hidden layer \n",
        "net = tflearn.fully_connected(net,8)\n",
        "net = tflearn.fully_connected(net,8)\n",
        "# activation=\"softmax\" tells probabillity of each neuron in the list (helps to finds the response)\n",
        "net = tflearn.fully_connected(net , len(output[0]), activation=\"softmax\")\n",
        "\n",
        "net = tflearn.regression(net)\n",
        "model = tflearn.DNN(net)\n",
        "\n",
        "# --------- Explanation--------------------\n",
        "\n",
        "#  INPUT DATA  ---> HIDDEN LAYER ---> HIDDEN LAYER ----> OUTPUT DATA \n",
        "#  45 input neurons --> 8 fully connected neurons --> 8 neurons ---> 6 neurons (\"Softmax\") "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xzumTFn4ZzRN",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://127.0.0.1:8080/",
          "height": 126
        },
        "outputId": "03d0d838-b16f-4ebb-dc1c-6ee037d6f9f1"
      },
      "source": [
        "# n_epoch means how much time it will se our data\n",
        "# try:\n",
        "#     model.load(\"model.tflearn\")\n",
        "# except:\n",
        "model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)\n",
        "model.save(\"model.tflearn\")"
      ],
      "execution_count": 139,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Training Step: 4999  | total loss: \u001b[1m\u001b[32m0.01296\u001b[0m\u001b[0m | time: 0.016s\n",
            "| Adam | epoch: 1000 | loss: 0.01296 - acc: 1.0000 -- iter: 32/39\n",
            "Training Step: 5000  | total loss: \u001b[1m\u001b[32m0.01272\u001b[0m\u001b[0m | time: 0.019s\n",
            "| Adam | epoch: 1000 | loss: 0.01272 - acc: 1.0000 -- iter: 39/39\n",
            "--\n",
            "INFO:tensorflow:/content/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TVqYlt1Ujhv9",
        "colab_type": "text"
      },
      "source": [
        "**Start prediction**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HrYt-uKWf20p",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def beg_of_words(s, words):\n",
        "  # contain 0 \n",
        "  bag = [0 for i in range(len(words))]\n",
        "  \n",
        "  s_words = nltk.word_tokenize(s)\n",
        "  s_words = [stemmer.stem(word.lower()) for word in s_words]\n",
        "\n",
        "  ## sentance (se)\n",
        "  for se in s_words:\n",
        "    for i,w in enumerate(words):\n",
        "      # that mean corrent words which we were looking at present in the sentace\n",
        "      if w == se:\n",
        "        bag[i] = 1\n",
        "  \n",
        "  return np.array(bag) "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "D_m1XbwdmBTX",
        "colab_type": "text"
      },
      "source": [
        "**Chat Response**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "oY7zlpYjmAXD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def chat():\n",
        "  print(\"start talking with the bot (type 'quit' to exit) \")\n",
        "  print(\"\\n\");\n",
        "  while True:\n",
        "    user_input = input(\"Type something 😃 : \")\n",
        "    \n",
        "    if user_input.lower() == 'quit':\n",
        "      break\n",
        "    \n",
        "    # give the predicted response based on the word\n",
        "    result = model.predict([beg_of_words(user_input , words)])\n",
        "    \n",
        "    #index of greated value in the list\n",
        "    result_index = np.argmax(result)\n",
        "    \n",
        "    #print the tag \n",
        "\n",
        "    tag = labels[result_index]\n",
        "    print(\"Movie Genre is {}\".format(tag))\n",
        "\n",
        "    # print the response \n",
        "    for intent in data['intents']:\n",
        "      if tag == intent['tag']:\n",
        "        response = intent['responses']\n",
        "    \n",
        "    print(\"🤖 : {}\".format(random.choice(response)))\n",
        "    print(\"\\n\")\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RqFWIE6OlE-7",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://127.0.0.1:8080/",
          "height": 545
        },
        "outputId": "60acdefa-aa78-4b12-bf41-7ebac82a1685"
      },
      "source": [
        "chat()"
      ],
      "execution_count": 154,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "start talking with the bot (type 'quit' to exit) \n",
            "\n",
            "\n",
            "Type something 😃 : Hello\n",
            "Movie Genre is greeting\n",
            "🤖 : Hi there, how can I help?\n",
            "\n",
            "\n",
            "Type something 😃 : I love romantic movie\n",
            "Movie Genre is Romance\n",
            "🤖 : Yeahh i like true love movies\n",
            "\n",
            "\n",
            "Type something 😃 : i fear watching horror movie\n",
            "Movie Genre is Horror\n",
            "🤖 : Hmm horror movies are so thriilled\n",
            "\n",
            "\n",
            "Type something 😃 : i like fighting and stunts movies\n",
            "Movie Genre is action and advanture\n",
            "🤖 : ohh nice\n",
            "\n",
            "\n",
            "Type something 😃 : i love to watch comedy movies\n",
            "Movie Genre is comedy\n",
            "🤖 : ohh nice\n",
            "\n",
            "\n",
            "Type something 😃 : quit\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "iYveRNbYnZL3",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}
关于此算法
import nltk
from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

# Import package

import tensorflow as tf
import json
import tflearn
import numpy as np 
import random
import pickle
with open('intents.json') as jsonFile:
  data = json.load(jsonFile)

print(data['intents'])
[{'tag': 'greeting', 'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day', 'Whats up'], 'responses': ['Hello!', 'Good to see you again!', 'Hi there, how can I help?'], 'context_set': ''}, {'tag': 'action and advanture', 'patterns': [' Elements of action/adventure (car chases, shootouts, explosions) and thriller', 'Combines action set-pieces with serious themes, character insight and/or emotional power', 'I like action movies', 'i like fighting movies', 'action sequences, such as fighting, stunts, car chases or explosions', 'Fighting is really awesome'], 'responses': ['I also like action movies :)', 'ohh nice', 'yeah action movie is awesome'], 'context_set': ''}, {'tag': 'comedy', 'patterns': ['These films are designed to make the audience laugh through amusement', 'very first silent movies were comedies, as slapstick comedy often relies on visual depictions', 'with many former stand-up comics transitioning to the film industry', 'satirical comedy-drama & the plot is', 'i like comedy movies', 'comedy is really love to watch', 'laughing is the best medicine', 'i love everyday'], 'responses': ['I also like comedy movies :)', 'ohh nice', 'yeah laughing is better for health'], 'context_set': ''}, {'tag': 'Horror', 'patterns': ['A horror film is a film that seeks to elicit fear for entertainment purposes', 'horror has existed as a film genre for more than a century', 'i like Horror movie', 'Horror movies are so thrilled', 'an intense feeling of fear, shock, or disgust.', 'I like intense and fear movies', 'Horror films effectively center on the dark side of life', 'i like dark movies', 'Horror Films are unsettling films designed to frighten and panic,monster'], 'responses': ['ohh i think you like science fiction kind of horror', 'Ohhh i also dark movie', 'Hmm horror movies are so thriilled'], 'context_set': ''}, {'tag': 'Romance', 'patterns': ['romance movies are romantic love stories', 'romance movie focus on passion, emotion, and the affectionate romantic involvement', 'Romance movie strong and pure love and romance the main plot focus', 'i love romantic and love movies', 'romantic love, obsessive love, sentimental love, spiritual love, forbidden love/romance, platonic love, sexual and passionate love, sacrificial love, explosive and destructive love', 'i want to watch Deep and true romantic love between two people', 'i like '], 'responses': ['Romantic movie are just awesome', 'Yeahh i like true love movies', 'Romance is everywhere :)', "I recommond don't watch with family :("], 'context_set': ''}, {'tag': 'Accept', 'patterns': ['ohh really', 'nice', 'ohhh'], 'responses': ['yess', 'yeah', 'yepp', "yeah that's true"], 'context_set': ''}]
try:
  with open('data.pickle',"rb") as f:
    words, labels, training, output = pickle.load(f)
except:
  words = []
  labels = []
  # doc_x contain pattern of words
  docs_x = []   
  # doc_y contain pattern of specific tag
  docs_y = []

  for intent in data["intents"]:
    for pattern in intent["patterns"]:
      # it's consider only root words by removing unnessary stuff from the sentance
      # Use Tokenization : that will help to grab the perticular word from the sentance
      # it will return the list which contain all the words in it 
      # nltk.download('punkt')

      wrds = nltk.word_tokenize(pattern)
      words.extend(wrds)

      # append pattern of words
      docs_x.append(wrds)
      docs_y.append(intent["tag"])

      # append tag in labels list
    if intent["tag"] not in labels:
      labels.append(intent["tag"])

  print(labels)
  print(docs_x)
  print(docs_y)
  # convert all the words into lowercase so that uppercase is not different then lowecase word 
  unvalid_data = ['?', ')', '(', ',', '.', '&']
  words = [stemmer.stem(w.lower()) for w in words if w not in unvalid_data]
  print(words)
  
  # remove duplicate and sort
  words = sorted(list(set(words)))
  print(words)
  
  # sort labels
  labels = sorted(labels)
  print(labels)

  # we create a bag of words that will represent a any given pattern
  # we create 1 hot encoding which will contain the 1 or 0 based on the word exist or not
  # in the sentance 

  # As neural network only understand numeric value rather then a word that's we need to convert them into numeric encoding

  # As bag of words represent by the encoding in the form 0 and 1
  training = []
  output = []

  ## if tag is present then it will be 1 or else 0 ( [0,0,0,0,1,0] in are case we have 6 tag )  
  out_empty = [0 for _ in range(len(labels))]
  print(out_empty)
  for x , doc in enumerate(docs_x):
    bag = []

    wrds= [stemmer.stem(w) for w in doc]
    #print(wrds)
    for w in words:
      if w in wrds:
        bag.append(1)
      else:
        bag.append(0)
    # print(bag)

    output_row = out_empty[:]
    output_row[labels.index(docs_y[x])] = 1 

    # get the training and output
    training.append(bag)
    output.append(output_row)

  training = np.array(training)
  output = np.array(output)
  #print(training)
  #print(output)
  with open('data.pickle',"wb") as f:
    pickle.dump( (words, labels, training, output) , f)

Tensorflow

# remove warning
import warnings
warnings.simplefilter('ignore')

# work with tensorflow
tf.reset_default_graph()

# training[0] all list have same len so we can take training[1]
net = tflearn.input_data(shape=[None,len(training[0])])

# 2 pipes of 8 hidden layer 
net = tflearn.fully_connected(net,8)
net = tflearn.fully_connected(net,8)
# activation="softmax" tells probabillity of each neuron in the list (helps to finds the response)
net = tflearn.fully_connected(net , len(output[0]), activation="softmax")

net = tflearn.regression(net)
model = tflearn.DNN(net)

# --------- Explanation--------------------

#  INPUT DATA  ---> HIDDEN LAYER ---> HIDDEN LAYER ----> OUTPUT DATA 
#  45 input neurons --> 8 fully connected neurons --> 8 neurons ---> 6 neurons ("Softmax") 
# n_epoch means how much time it will se our data
# try:
#     model.load("model.tflearn")
# except:
model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
model.save("model.tflearn")
Training Step: 4999  | total loss: <span style="font-weight:bold;color:rgb(0,187,0)">0.01296</span> | time: 0.016s
| Adam | epoch: 1000 | loss: 0.01296 - acc: 1.0000 -- iter: 32/39
Training Step: 5000  | total loss: <span style="font-weight:bold;color:rgb(0,187,0)">0.01272</span> | time: 0.019s
| Adam | epoch: 1000 | loss: 0.01272 - acc: 1.0000 -- iter: 39/39
--
INFO:tensorflow:/content/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.

开始预测

def beg_of_words(s, words):
  # contain 0 
  bag = [0 for i in range(len(words))]
  
  s_words = nltk.word_tokenize(s)
  s_words = [stemmer.stem(word.lower()) for word in s_words]

  ## sentance (se)
  for se in s_words:
    for i,w in enumerate(words):
      # that mean corrent words which we were looking at present in the sentace
      if w == se:
        bag[i] = 1
  
  return np.array(bag) 

聊天回复

def chat():
  print("start talking with the bot (type 'quit' to exit) ")
  print("\n");
  while True:
    user_input = input("Type something 😃 : ")
    
    if user_input.lower() == 'quit':
      break
    
    # give the predicted response based on the word
    result = model.predict([beg_of_words(user_input , words)])
    
    #index of greated value in the list
    result_index = np.argmax(result)
    
    #print the tag 

    tag = labels[result_index]
    print("Movie Genre is {}".format(tag))

    # print the response 
    for intent in data['intents']:
      if tag == intent['tag']:
        response = intent['responses']
    
    print("🤖 : {}".format(random.choice(response)))
    print("\n")
chat()
start talking with the bot (type &#x27;quit&#x27; to exit) 


Type something 😃 : Hello
Movie Genre is greeting
🤖 : Hi there, how can I help?


Type something 😃 : I love romantic movie
Movie Genre is Romance
🤖 : Yeahh i like true love movies


Type something 😃 : i fear watching horror movie
Movie Genre is Horror
🤖 : Hmm horror movies are so thriilled


Type something 😃 : i like fighting and stunts movies
Movie Genre is action and advanture
🤖 : ohh nice


Type something 😃 : i love to watch comedy movies
Movie Genre is comedy
🤖 : ohh nice


Type something 😃 : quit